def train_and_evaluate(model, train_dataloader, val_dataloader, optimizer,
                       loss_fn, metrics, params, model_dir, restore_file=None):
    """Train the model and evaluate every epoch.

    Args:
        model: (torch.nn.Module) the neural network
        train_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches training data
        val_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches validation data
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch
        params: (Params) hyperparameters
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) optional - name of file to restore from (without its extension .pth.tar)
    """
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_val_acc = 0.0

    # learning rate schedulers for different models:
    if params.model_version == "resnet18":
        scheduler = StepLR(optimizer, step_size=150, gamma=0.1)
    # cnn models always train for < 100 epochs, so with step_size=100 the
    # scheduler below never actually decays the learning rate
    elif params.model_version == "cnn":
        scheduler = StepLR(optimizer, step_size=100, gamma=0.2)
    else:
        raise ValueError("Unknown model_version: {}".format(params.model_version))

    for epoch in range(params.num_epochs):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        train(model, optimizer, loss_fn, train_dataloader, metrics, params)

        # step the scheduler once per epoch, after the optimizer updates
        # (the required ordering since PyTorch 1.1)
        scheduler.step()

        # Evaluate for one epoch on validation set
        val_metrics = evaluate(model, loss_fn, val_dataloader, metrics, params)        

        val_acc = val_metrics['accuracy']
        is_best = val_acc >= best_val_acc

        # Save weights
        utils.save_checkpoint({'epoch': epoch + 1,
                               'state_dict': model.state_dict(),
                               'optim_dict': optimizer.state_dict()},
                               is_best=is_best,
                               checkpoint=model_dir)

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir, "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir, "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
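
A note on the scheduler pattern above: since PyTorch 1.1, scheduler.step() must come after the epoch's optimizer updates, otherwise the first decay fires one epoch early. A minimal self-contained sketch of StepLR with the same hyperparameters (the model and data here are placeholders, not part of the original):

import torch
from torch.optim.lr_scheduler import StepLR

# Placeholder model/optimizer just to demonstrate the schedule.
model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = StepLR(optimizer, step_size=150, gamma=0.1)  # decay x0.1 every 150 epochs

for epoch in range(300):
    # ... train for one epoch: forward, loss.backward(), optimizer.step() ...
    scheduler.step()  # after 150 calls the LR drops from 0.1 to 0.01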
Example #2
def main():
    init_platform()
    logger.info('Experiment name: %s', default.exp_name)
    logger.info('Initializing ...')

    if default.mode in ('train', 'infer'):
        train_loader, val_loaders, train_sampler = init_data()
    elif default.mode == 'demo':
        init_demo()

    model, criterions, optimizer = init_model()

    if default.mode == 'infer':
        evaluate(val_loaders, model)
        return
    elif default.mode == 'demo':
        demo_fun(model)
        return

    # mode == 'train'
    global accs, best_acc1
    if default.validate_at_begin:
        acc_all = evaluate(val_loaders, model)
        accs[default.begin_epoch] = acc_all[0][config.TEST.CRITERION]
        best_acc1 = acc_all[0][config.TEST.CRITERION]
        logger.info('iter %d: %.4f' % (default.begin_epoch, accs[default.begin_epoch]))

    for epoch in range(default.begin_epoch, default.epochs):
        if default.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)
        # train for one epoch
        train(train_loader, model, criterions, optimizer, epoch)
        fn = save_checkpoint(epoch, model, optimizer, last_bkup=True)

        # evaluate on validation set
        acc_all = evaluate(val_loaders, model)
        accs[epoch+1] = acc_all[0][config.TEST.CRITERION]
        for key in sorted(accs.keys()):
            print('iter %d: %.4f' % (key, accs[key]))

        # remember best acc and save checkpoint
        is_best = accs[epoch+1] > best_acc1
        best_acc1 = max(accs[epoch+1], best_acc1)
        if is_best or not default.keep_best_model:
            new_fn = checkpoint_name(epoch+1, False)
            os.rename(fn, new_fn)
            if default.keep_best_model:
                if os.path.exists(default.best_model_path):
                    os.remove(default.best_model_path)
                default.best_model_path = new_fn
Example #3
def train_net(args):
    torch.manual_seed(7)
    np.random.seed(7)
    checkpoint = args.checkpoint
    start_epoch = 0
    best_acc = 0
    writer = SummaryWriter()
    epochs_since_improvement = 0

    # Initialize / load checkpoint
    if checkpoint is None:
        model = CarRecognitionModel()
        model = nn.DataParallel(model)

        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay,
                                    nesterov=args.nesterov)
        # optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)

    else:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        epochs_since_improvement = checkpoint['epochs_since_improvement']
        model = checkpoint['model']
        optimizer = checkpoint['optimizer']

    logger = get_logger()

    # Move to GPU, if available
    model = model.to(device)

    # Loss function
    criterion = nn.CrossEntropyLoss()

    # Custom dataloaders
    train_dataset = CarRecognitionDataset('train')
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=num_workers)
    valid_dataset = CarRecognitionDataset('valid')
    valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               num_workers=num_workers)

    # Epochs
    for epoch in range(start_epoch, args.end_epoch):
        if epochs_since_improvement > 0 and epochs_since_improvement % patience == 0:
            adjust_learning_rate(optimizer, shrink_factor=0.1)

        lr = get_learning_rate(optimizer)
        logger.info('Learning rate: ' + str(lr))
        writer.add_scalar('model/learning_rate', lr, epoch)

        # One epoch's training
        train_loss, train_acc = train(train_loader=train_loader,
                                      model=model,
                                      criterion=criterion,
                                      optimizer=optimizer,
                                      epoch=epoch,
                                      logger=logger)

        writer.add_scalar('model/train_loss', train_loss, epoch)
        writer.add_scalar('model/train_accuracy', train_acc, epoch)

        # One epoch's validation
        valid_loss, valid_acc = valid(valid_loader=valid_loader,
                                      model=model,
                                      criterion=criterion,
                                      logger=logger)

        writer.add_scalar('model/valid_loss', valid_loss, epoch)
        writer.add_scalar('model/valid_accuracy', valid_acc, epoch)

        # Check if there was an improvement
        is_best = valid_acc > best_acc
        best_acc = max(valid_acc, best_acc)
        if not is_best:
            epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" %
                  (epochs_since_improvement, ))
        else:
            epochs_since_improvement = 0

        # Save checkpoint
        save_checkpoint(epoch, epochs_since_improvement, model, optimizer,
                        best_acc, is_best)
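
adjust_learning_rate and get_learning_rate are not shown above. A plausible sketch of both, following the pattern common to this family of repos (hypothetical; the project's own helpers may differ):

def adjust_learning_rate(optimizer, shrink_factor):
    # Multiply every parameter group's LR by shrink_factor (e.g. 0.1).
    for param_group in optimizer.param_groups:
        param_group['lr'] = param_group['lr'] * shrink_factor


def get_learning_rate(optimizer):
    # Report the LR of the first parameter group.
    return optimizer.param_groups[0]['lr']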
Example #4
def main(rank, args):

    # Distributed setup

    if args.distributed:
        setup_distributed(rank, args.world_size)

    not_main_rank = args.distributed and rank != 0

    logging.info("Start time: %s", datetime.now())

    # Explicitly set seed to make sure models created in separate processes
    # start from same random weights and biases
    torch.manual_seed(args.seed)

    # Empty CUDA cache
    torch.cuda.empty_cache()

    # Change backend for flac files
    torchaudio.set_audio_backend("soundfile")

    # Transforms

    melkwargs = {
        "n_fft": args.win_length,
        "n_mels": args.n_bins,
        "hop_length": args.hop_length,
    }

    sample_rate_original = 16000

    if args.type == "mfcc":
        transforms = torch.nn.Sequential(
            torchaudio.transforms.MFCC(
                sample_rate=sample_rate_original,
                n_mfcc=args.n_bins,
                melkwargs=melkwargs,
            ), )
        num_features = args.n_bins
    elif args.type == "waveform":
        transforms = torch.nn.Sequential(UnsqueezeFirst())
        num_features = 1
    else:
        raise ValueError("Model type not supported")

    if args.normalize:
        transforms = torch.nn.Sequential(transforms, Normalize())

    augmentations = torch.nn.Sequential()
    if args.freq_mask:
        augmentations = torch.nn.Sequential(
            augmentations,
            torchaudio.transforms.FrequencyMasking(
                freq_mask_param=args.freq_mask),
        )
    if args.time_mask:
        augmentations = torch.nn.Sequential(
            augmentations,
            torchaudio.transforms.TimeMasking(time_mask_param=args.time_mask),
        )

    # Text preprocessing

    char_blank = "*"
    char_space = " "
    char_apostrophe = "'"
    labels = char_blank + char_space + char_apostrophe + string.ascii_lowercase
    language_model = LanguageModel(labels, char_blank, char_space)

    # Dataset

    training, validation = split_process_librispeech(
        [args.dataset_train, args.dataset_valid],
        [transforms, transforms],
        language_model,
        root=args.dataset_root,
        folder_in_archive=args.dataset_folder_in_archive,
    )

    # Decoder

    if args.decoder == "greedy":
        decoder = GreedyDecoder()
    else:
        raise ValueError("Selected decoder not supported")

    # Model

    model = Wav2Letter(
        num_classes=language_model.length,
        input_type=args.type,
        num_features=num_features,
    )

    if args.jit:
        model = torch.jit.script(model)

    if args.distributed:
        n = torch.cuda.device_count() // args.world_size
        devices = list(range(rank * n, (rank + 1) * n))
        model = model.to(devices[0])
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=devices)
    else:
        devices = ["cuda" if torch.cuda.is_available() else "cpu"]
        model = model.to(devices[0], non_blocking=True)
        model = torch.nn.DataParallel(model)

    n = count_parameters(model)
    logging.info("Number of parameters: %s", n)

    # Optimizer

    if args.optimizer == "adadelta":
        optimizer = Adadelta(
            model.parameters(),
            lr=args.learning_rate,
            weight_decay=args.weight_decay,
            eps=args.eps,
            rho=args.rho,
        )
    elif args.optimizer == "sgd":
        optimizer = SGD(
            model.parameters(),
            lr=args.learning_rate,
            momentum=args.momentum,
            weight_decay=args.weight_decay,
        )
    elif args.optimizer == "adam":
        optimizer = Adam(
            model.parameters(),
            lr=args.learning_rate,
            momentum=args.momentum,
            weight_decay=args.weight_decay,
        )
    elif args.optimizer == "adamw":
        optimizer = AdamW(
            model.parameters(),
            lr=args.learning_rate,
            momentum=args.momentum,
            weight_decay=args.weight_decay,
        )
    else:
        raise ValueError("Selected optimizer not supported")

    if args.scheduler == "exponential":
        scheduler = ExponentialLR(optimizer, gamma=args.gamma)
    elif args.scheduler == "reduceonplateau":
        scheduler = ReduceLROnPlateau(optimizer, patience=10, threshold=1e-3)
    else:
        raise ValueError("Selected scheduler not supported")

    criterion = torch.nn.CTCLoss(blank=language_model.mapping[char_blank],
                                 zero_infinity=False)

    # Data Loader

    collate_fn_train = collate_factory(model_length_function, augmentations)
    collate_fn_valid = collate_factory(model_length_function)

    loader_training_params = {
        "num_workers": args.workers,
        "pin_memory": True,
        "shuffle": True,
        "drop_last": True,
    }
    loader_validation_params = loader_training_params.copy()
    loader_validation_params["shuffle"] = False

    loader_training = DataLoader(
        training,
        batch_size=args.batch_size,
        collate_fn=collate_fn_train,
        **loader_training_params,
    )
    loader_validation = DataLoader(
        validation,
        batch_size=args.batch_size,
        collate_fn=collate_fn_valid,
        **loader_validation_params,
    )

    # Setup checkpoint

    best_loss = 1.0

    load_checkpoint = args.checkpoint and os.path.isfile(args.checkpoint)

    if args.distributed:
        torch.distributed.barrier()

    if load_checkpoint:
        logging.info("Checkpoint: loading %s", args.checkpoint)
        checkpoint = torch.load(args.checkpoint)

        args.start_epoch = checkpoint["epoch"]
        best_loss = checkpoint["best_loss"]

        model.load_state_dict(checkpoint["state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        scheduler.load_state_dict(checkpoint["scheduler"])

        logging.info("Checkpoint: loaded '%s' at epoch %s", args.checkpoint,
                     checkpoint["epoch"])
    else:
        logging.info("Checkpoint: not found")

        save_checkpoint(
            {
                "epoch": args.start_epoch,
                "state_dict": model.state_dict(),
                "best_loss": best_loss,
                "optimizer": optimizer.state_dict(),
                "scheduler": scheduler.state_dict(),
            },
            False,
            args.checkpoint,
            not_main_rank,
        )

    if args.distributed:
        torch.distributed.barrier()

    torch.autograd.set_detect_anomaly(False)

    for epoch in range(args.start_epoch, args.epochs):

        logging.info("Epoch: %s", epoch)

        train_one_epoch(
            model,
            criterion,
            optimizer,
            scheduler,
            loader_training,
            decoder,
            language_model,
            devices[0],
            epoch,
            args.clip_grad,
            not_main_rank,
            not args.reduce_lr_valid,
        )

        loss = evaluate(
            model,
            criterion,
            loader_validation,
            decoder,
            language_model,
            devices[0],
            epoch,
            not_main_rank,
        )

        if args.reduce_lr_valid and isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(loss)

        is_best = loss < best_loss
        best_loss = min(loss, best_loss)
        save_checkpoint(
            {
                "epoch": epoch + 1,
                "state_dict": model.state_dict(),
                "best_loss": best_loss,
                "optimizer": optimizer.state_dict(),
                "scheduler": scheduler.state_dict(),
            },
            is_best,
            args.checkpoint,
            not_main_rank,
        )

    logging.info("End time: %s", datetime.now())

    if args.distributed:
        torch.distributed.destroy_process_group()
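
save_checkpoint(state, is_best, filename, disable) is defined elsewhere in this pipeline. A sketch of the usual shape of such a helper, assuming the fourth argument suppresses writes on non-main ranks:

import os
import shutil
import torch

def save_checkpoint(state, is_best, filename, disable=False):
    # Skip saving on non-main distributed ranks or when no path is given.
    if disable or not filename:
        return
    tmp = filename + ".temp"
    torch.save(state, tmp)    # write to a temp file first,
    os.rename(tmp, filename)  # then rename so the checkpoint is never half-written
    if is_best:
        shutil.copyfile(filename, filename + ".best")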
Example #5
        train()
        if 't0' in optimizer.param_groups[0]:
            tmp = {}
            for prm in model.parameters():
                tmp[prm] = prm.data.clone()
                prm.data = optimizer.state[prm]['ax'].clone()

            val_loss2 = evaluate(val_data)
            logging('-' * 89)
            logging('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                    'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                               val_loss2, math.exp(val_loss2)))
            logging('-' * 89)

            if val_loss2 < stored_loss:
                save_checkpoint(model, optimizer, args.save, finetune=True)
                logging('Saving Averaged!')
                stored_loss = val_loss2

            for prm in model.parameters():
                prm.data = tmp[prm].clone()

        if len(best_val_loss) > args.nonmono and val_loss2 > min(best_val_loss[:-args.nonmono]):
            logging('Done!')
            break
        best_val_loss.append(val_loss2)

except KeyboardInterrupt:
    logging('-' * 89)
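
The 't0' in optimizer.param_groups[0] test above detects that training has switched to torch.optim.ASGD; evaluation then runs on the averaged iterate the optimizer keeps in optimizer.state[p]['ax']. A self-contained sketch of the swap/evaluate/restore dance, using the same variable names as the snippet:

import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.ASGD(model.parameters(), lr=0.1, t0=0, lambd=0.0)

# One step so optimizer.state[prm]['ax'] (the running average) exists.
loss = model(torch.randn(8, 4)).pow(2).mean()
loss.backward()
optimizer.step()

# Swap in the averaged weights, evaluate, then restore the raw weights.
tmp = {}
for prm in model.parameters():
    tmp[prm] = prm.data.clone()
    prm.data = optimizer.state[prm]['ax'].clone()

# ... run validation here on the averaged parameters ...

for prm in model.parameters():
    prm.data = tmp[prm].clone()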
Example #6
        if 't0' in optimizer.param_groups[0]:
            tmp = {}
            for prm in model.parameters():
                tmp[prm] = prm.data.clone()
                prm.data = optimizer.state[prm]['ax'].clone()

            val_loss2 = evaluate(val_data)
            logging('-' * 89)
            logging('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                    'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                               val_loss2, math.exp(val_loss2)))
            logging('-' * 89)

            val_perp_list.append(math.exp(val_loss2))
            if val_loss2 < stored_loss:
                save_checkpoint(model, optimizer, args.save)
                logging('Saving Averaged!')
                stored_loss = val_loss2

            for prm in model.parameters():
                prm.data = tmp[prm].clone()

        else:
            val_loss = evaluate(val_data, eval_batch_size)
            logging('-' * 89)
            logging('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                    'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                               val_loss, math.exp(val_loss)))
            logging('-' * 89)

            val_perp_list.append(math.exp(val_loss))
Example #7

for epoch in range(start_epoch, start_epoch + 50):
    time_ep = time.time()

    lr = args.lr_set
    utils.adjust_learning_rate(optimizer, lr)
    train_res = utils.train_epoch(loaders['train'], model, criterion, optimizer)

    test_res = utils.eval(loaders['test'], model, criterion)


    if train_res['loss'] < train_res_swa['loss'] and test_res['loss'] > test_res_swa['loss']:
        print('find', file=f_out)
        print('find')
        utils.save_checkpoint(
            args.dir,
            epoch + 1,
            state_dict=model.state_dict(),
            optimizer=optimizer.state_dict()
        )


    time_ep = time.time() - time_ep
    values = [epoch + 1, lr, train_res['loss'], train_res['accuracy'], test_res['loss'], test_res['accuracy'], time_ep]

    table = tabulate.tabulate([values], columns, tablefmt='simple', floatfmt='8.4f')
    if epoch % 40 == 0:
        table = table.split('\n')
        table = '\n'.join([table[1]] + table)
    else:
        table = table.split('\n')[2]
    print(table, file=f_out)
    print(table)
Example #8
ensemble_size = 0
predictions_sum = np.zeros((len(loaders['test'].dataset), num_classes))

columns = [
    'ep', 'lr', 'tr_loss', 'tr_acc', 'te_nll', 'te_acc', 'ens_acc', 'time'
]

if args.regularizer is None:
    regularizer = None
elif args.regularizer == 'MSE2':
    regularizer = regularization.TwoModelsMSE(model, args.reg_wd).reg

utils.save_checkpoint(args.dir,
                      start_epoch,
                      name='fge',
                      model_state=model.state_dict(),
                      optimizer_state=optimizer.state_dict())

for epoch in range(args.epochs):
    time_ep = time.time()
    lr_schedule = utils.cyclic_learning_rate(epoch, args.cycle, args.lr_1,
                                             args.lr_2)
    if args.weighted_samples is None:
        train_res = utils.train(loaders['train'],
                                model,
                                optimizer,
                                criterion,
                                lr_schedule=lr_schedule,
                                regularizer=regularizer)
    else:
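
The loop above (truncated mid-branch) drives its per-iteration learning rate with utils.cyclic_learning_rate(epoch, cycle, lr_1, lr_2). In the FGE reference code this is a triangular schedule that, within each cycle, ramps linearly from lr_1 to lr_2 and back; the sketch below assumes the local utils follows that form:

def cyclic_learning_rate(epoch, cycle, alpha_1, alpha_2):
    # Returns a function of the within-epoch progress t_iter in [0, 1).
    def schedule(t_iter):
        t = ((epoch % cycle) + t_iter) / cycle   # position within the cycle
        if t < 0.5:
            return alpha_1 * (1.0 - 2.0 * t) + alpha_2 * 2.0 * t
        return alpha_1 * (2.0 * t - 1.0) + alpha_2 * (2.0 - 2.0 * t)
    return schedule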
Example #9
def main(args, logger):
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    trn_df['is_original'] = 1
    # raw_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/raw_pseudo_tst_df.csv')
    # half_opt_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/half_opt_pseudo_tst_df.csv')
    # opt_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/opt_pseudo_tst_df.csv')

    # clean texts
    # trn_df = clean_data(trn_df, ['question_title', 'question_body', 'answer'])

    # load additional tokens
    # with open('./mnt/inputs/nes_info/trn_over_10_vocab.pkl', 'rb') as fin:
    #     additional_tokens = pickle.load(fin)

    gkf = GroupKFold(n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold < loaded_fold:
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(
                    histories["val_metric"][fold])])
            continue
        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ',
            logger)
        fold_trn_df = trn_df.iloc[trn_idx]  # .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        # use only original row
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)
        temp = pd.Series(
            list(
                itertools.chain.from_iterable(
                    fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
                    fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
                    fold_trn_df.answer.apply(lambda x: x.split(' '))))
        ).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # tokens = []
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
        ]  #  + additional_tokens

        # fold_trn_df = pd.concat([fold_trn_df, opt_pseudo_df, half_opt_pseudo_df], axis=0)

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode=TQA_MODE,
            TBSEP='[TBSEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)
        val_dataset = QUESTDataset(
            df=fold_val_df,
            mode='valid',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode=TQA_MODE,
            TBSEP='[TBSEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        val_sampler = RandomSampler(data_source=val_dataset)
        val_loader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=val_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=False,
                                pin_memory=True)

        fobj = BCEWithLogitsLoss()
        state_dict = BertModel.from_pretrained(MODEL_PRETRAIN).state_dict()
        model = BertModelForBinaryMultiLabelClassifier(
            num_labels=len(LABEL_COL),
            config_path=MODEL_CONFIG_PATH,
            state_dict=state_dict,
            token_size=len(trn_dataset.tokenizer),
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
            cat_last_layer_num=1,
            do_ratio=0.2,
        )
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=MAX_EPOCH,
                                                         eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            if epoch < 1:
                model.freeze_unfreeze_bert(freeze=True, logger=logger)
            else:
                model.freeze_unfreeze_bert(freeze=False, logger=logger)
            model = DataParallel(model)
            model = model.to(DEVICE)
            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader,
                                       DEVICE)
            val_loss, val_metric, val_metric_raws, val_y_preds, val_y_trues, val_qa_ids = test(
                model, fobj, val_loader, DEVICE, mode='valid')

            scheduler.step()
            histories['trn_loss'].setdefault(fold, []).append(trn_loss)
            histories['val_loss'].setdefault(fold, []).append(val_loss)
            histories['val_metric'].setdefault(fold, []).append(val_metric)
            histories['val_metric_raws'].setdefault(fold, []).append(val_metric_raws)

            logging_val_metric_raws = ''
            for val_metric_raw in val_metric_raws:
                logging_val_metric_raws += f'{float(val_metric_raw):.4f}, '

            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f} -- '
                f'val_metric_raws : {logging_val_metric_raws}', logger)
            model = model.to('cpu')
            model = model.module
            save_checkpoint(
                f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                model,
                optimizer,
                scheduler,
                histories,
                val_y_preds,
                val_y_trues,
                val_qa_ids,
                fold,
                epoch,
                val_loss,
                val_metric,
            )
        fold_best_metrics.append(np.max(histories["val_metric"][fold]))
        fold_best_metrics_raws.append(
            histories["val_metric_raws"][fold][np.argmax(
                histories["val_metric"][fold])])
        save_and_clean_for_prediction(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                                      trn_dataset.tokenizer,
                                      clean=False)
        del model

    # calc training stats
    fold_best_metric_mean = np.mean(fold_best_metrics)
    fold_best_metric_std = np.std(fold_best_metrics)
    fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}'
    sel_log(fold_stats, logger)
    send_line_notification(fold_stats)

    fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0)
    fold_raw_stats = ''
    for metric_stats_raw in fold_best_metrics_raws_mean:
        fold_raw_stats += f'{float(metric_stats_raw):.4f},'
    sel_log(fold_raw_stats, logger)
    send_line_notification(fold_raw_stats)

    sel_log('now saving best checkpoints...', logger)
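
A note on the split above: GroupKFold keyed on question_body_le keeps duplicated question bodies entirely on one side of each fold, which is what makes the is_original filtering of the validation rows safe. A minimal illustration:

import numpy as np
from sklearn.model_selection import GroupKFold

X = np.arange(10).reshape(-1, 1)
groups = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4])  # e.g. question_body_le

for fold, (trn_idx, val_idx) in enumerate(GroupKFold(n_splits=5).split(X, groups=groups)):
    # Every group lands entirely in train or entirely in validation.
    assert set(groups[trn_idx]).isdisjoint(groups[val_idx])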
Example #10

def train_and_evaluate(model,
                       train_data,
                       val_data,
                       optimizer,
                       scheduler,
                       params,
                       model_dir,
                       restore_file=None):
    """Train the model and evaluate every epoch."""
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir,
                                    restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(1, params.epoch_num + 1):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch, params.epoch_num))

        # Compute number of batches in one epoch
        params.train_steps = params.train_size // params.batch_size
        params.val_steps = params.val_size // params.batch_size

        # data iterator for training
        train_data_iterator = data_loader.data_iterator(train_data,
                                                        shuffle=True)
        # Train for one epoch on training set
        train(model, train_data_iterator, optimizer, scheduler, params)

        # data iterator for evaluation
        train_data_iterator = data_loader.data_iterator(train_data,
                                                        shuffle=False)
        val_data_iterator = data_loader.data_iterator(val_data, shuffle=False)

        # Evaluate for one epoch on training set and validation set
        params.eval_steps = params.train_steps
        train_metrics = evaluate(model,
                                 train_data_iterator,
                                 params,
                                 mark='Train')
        params.eval_steps = params.val_steps
        val_metrics = evaluate(model, val_data_iterator, params, mark='Val')
        print("val metrics :", val_metrics)
        val_loss = val_metrics['loss']
        improve_loss = best_val_loss - val_loss  # positive when the validation loss decreases

        # Save weights of the network
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself
        optimizer_to_save = optimizer.optimizer if args.fp16 else optimizer
        # model_dir = os.path.join(model_dir,"weightedA")
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model_to_save.state_dict(),
                'optim_dict': optimizer_to_save.state_dict()
            },
            is_best=improve_loss > 0,
            checkpoint=model_dir)
        if improve_loss > 0:
            logging.info("- Found new best loss")
            best_val_loss = val_loss
            if improve_loss < params.patience:
                patience_counter += 1
            else:
                patience_counter = 0
        else:
            patience_counter += 1

        # Early stopping and logging best f1
        if (patience_counter >= params.patience_num
                and epoch > params.min_epoch_num) or epoch == params.epoch_num:
            logging.info("Best val loss: {:05.2f}".format(best_val_loss))
            break
Example #11
def train(train_loader, val_multi_loader, model, criterion, optimizer,
          lr_scheduler, start_iter, tb_logger):

    global best_loss

    batch_time = AverageMeter(config.print_freq)
    fw_time = AverageMeter(config.print_freq)
    bp_time = AverageMeter(config.print_freq)
    sy_time = AverageMeter(config.print_freq)
    step_time = AverageMeter(config.print_freq)
    data_time = AverageMeter(config.print_freq)
    losses = AverageMeter(config.print_freq)
    top1 = AverageMeter(config.print_freq)
    top2 = AverageMeter(config.print_freq)

    # switch to train mode
    model.train()

    world_size = dist.get_world_size()
    rank = dist.get_rank()

    logger = logging.getLogger('global_logger')

    end = time.time()

    for i, (input, target) in enumerate(train_loader):
        curr_step = start_iter + i
        lr_scheduler.step(curr_step)
        current_lr = lr_scheduler.get_lr()[0]

        # measure data loading time
        data_time.update(time.time() - end)

        # transfer input to gpu
        target = target.cuda()
        input = input.cuda()

        # forward
        output = model(input)
        loss = criterion(output, target) / world_size

        # measure accuracy and record loss
        prec1, prec2 = accuracy(output, target, topk=(1, 2))

        reduced_loss = loss.clone()
        reduced_prec1 = prec1.clone() / world_size
        reduced_prec2 = prec2.clone() / world_size

        dist.all_reduce(reduced_loss)
        dist.all_reduce(reduced_prec1)
        dist.all_reduce(reduced_prec2)

        losses.update(reduced_loss.item())
        top1.update(reduced_prec1.item())
        top2.update(reduced_prec2.item())

        # backward
        optimizer.zero_grad()

        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)

        if curr_step % config.print_freq == 0 and rank == 0:
            total_batch_size = world_size * args.batch_size
            epoch = (curr_step * total_batch_size) // len(train_loader.dataset)
            tb_logger.add_scalar('loss_train', losses.avg, curr_step)
            tb_logger.add_scalar('acc1_train', top1.avg, curr_step)
            tb_logger.add_scalar('acc2_train', top2.avg, curr_step)
            tb_logger.add_scalar('lr', current_lr, curr_step)
            logger.info('Iter: [{0}/{1}]\t'
                        'Epoch: {2}\t'
                        'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                        'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                        'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                        'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                        'LR {lr:.4f}'.format(curr_step,
                                             len(train_loader),
                                             epoch,
                                             batch_time=batch_time,
                                             data_time=data_time,
                                             loss=losses,
                                             top1=top1,
                                             lr=current_lr))

        if curr_step > 0 and curr_step % config.val_freq == 0:
            if not args.no_val:
                total_loss = 0
                for dataset_idx in range(len(val_multi_loader)):
                    val_loss, prec1, prec2 = validate(
                        dataset_idx,
                        val_multi_loader[dataset_idx],
                        model,
                        criterion,
                        tb_logger,
                        curr_step=curr_step,
                        save_softmax=True)

                    total_loss += val_loss
                # average loss over multiple validation sets
                if len(val_multi_loader) > 0:
                    loss = total_loss / len(val_multi_loader)
            else:
                loss = 1e9

            if rank == 0:
                # remember best video logloss recorded at rank 0 and save checkpoint
                is_best = loss < best_loss
                best_loss = min(loss, best_loss)
                save_checkpoint(
                    {
                        'step': curr_step,
                        'arch': config.model.arch,
                        'state_dict': model.state_dict(),
                        'best_loss': best_loss,
                        'optimizer': optimizer.state_dict(),
                    }, is_best, args.save_path_dated + '/ckpt')

        end = time.time()
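
AverageMeter(config.print_freq) above tracks statistics over a sliding window of recent batches. A sketch of the simpler cumulative variant this helper is usually derived from:

class AverageMeter:
    """Track the latest value, running sum, count, and mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count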
Example #12
def train_and_evaluate(model,
                       train_dataloader,
                       val_dataloader,
                       optimizer,
                       loss_fn,
                       metrics,
                       params,
                       model_dir,
                       restore_file=None):
    """Train the model and evaluate every epoch.

    Args:
        model: (torch.nn.Module) the neural network
        train_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches training data
        val_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches validation data
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch
        params: (Params) hyperparameters
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) optional- name of file to restore from (without its extension .pth.tar)
    """
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir,
                                    restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_val_acc = 0.0

    for epoch in range(params.num_epochs):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        train(model, optimizer, loss_fn, train_dataloader, metrics, params)

        # Evaluate for one epoch on validation set
        val_metrics = evaluate(model, loss_fn, val_dataloader, metrics, params)

        val_acc = val_metrics['accuracy']
        is_best = val_acc >= best_val_acc

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir)

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir,
                                          "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir,
                                      "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
Example #13
def main():
    global best_acc, use_apex, mean, std, scale

    args = parse_args()
    args.mean, args.std, args.scale, args.use_apex = mean, std, scale, use_apex
    args.is_master = args.local_rank == 0

    if args.deterministic:
        cudnn.deterministic = True
        torch.manual_seed(0)
        random.seed(0)
        np.random.seed(0)

    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1 and args.use_apex

    if args.is_master:
        print("opt_level = {}".format(args.opt_level))
        print("keep_batchnorm_fp32 = {}".format(args.keep_batchnorm_fp32),
              type(args.keep_batchnorm_fp32))
        print("loss_scale = {}".format(args.loss_scale), type(args.loss_scale))
        print("\nCUDNN VERSION: {}\n".format(torch.backends.cudnn.version()))
        print(f"Use Apex: {args.use_apex}")
        print(f"Distributed Training Enabled: {args.distributed}")

    args.gpu = 0
    args.world_size = 1

    if args.distributed:
        args.gpu = args.local_rank
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()
        # Scale learning rate based on global batch size
        # args.lr *= args.batch_size * args.world_size / 256

    if args.use_apex:
        assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled."

    # create model
    model = models.ResNet18(args.num_patches, args.num_angles)

    if args.sync_bn:
        import apex
        print("using apex synced BN")
        model = apex.parallel.convert_syncbn_model(model)

    model = model.cuda()
    optimiser = Ranger(model.parameters(), lr=args.lr)
    criterion = nn.CrossEntropyLoss().cuda()

    # Initialize Amp.  Amp accepts either values or strings for the optional override arguments,
    # for convenient interoperation with argparse.
    if args.use_apex:
        model, optimiser = amp.initialize(
            model,
            optimiser,
            opt_level=args.opt_level,
            keep_batchnorm_fp32=args.keep_batchnorm_fp32,
            loss_scale=args.loss_scale)

    # For distributed training, wrap the model with apex.parallel.DistributedDataParallel.
    # This must be done AFTER the call to amp.initialize.  If model = DDP(model) is called
    # before model, ... = amp.initialize(model, ...), the call to amp.initialize may alter
    # the types of model's parameters in a way that disrupts or destroys DDP's allreduce hooks.
    if args.distributed:
        model = DDP(model, delay_allreduce=True)
    else:
        model = nn.DataParallel(model)

    # Optionally resume from a checkpoint
    if args.resume:
        # Use a local scope to avoid dangling references
        def resume():
            global best_acc
            if os.path.isfile(args.resume):
                print("=> loading checkpoint '{}'".format(args.resume))
                checkpoint = torch.load(
                    args.resume,
                    map_location=lambda storage, loc: storage.cuda(args.gpu))
                args.start_epoch = checkpoint['epoch']
                best_acc = checkpoint['best_acc']
                args.poisson_rate = checkpoint["poisson_rate"]
                model.load_state_dict(checkpoint['state_dict'])
                optimiser.load_state_dict(checkpoint['optimiser'])
                print("=> loaded checkpoint '{}' (epoch {})".format(
                    args.resume, checkpoint['epoch']))
            else:
                print("=> no checkpoint found at '{}'".format(args.resume))

        resume()

    if args.do_ssl:
        stl_unlabeled = datasets.STL10(root=args.data,
                                       split='unlabeled',
                                       download=args.download)
        indices = list(range(len(stl_unlabeled)))
        train_indices = indices[:int(len(indices) * 0.9)]
        val_indices = indices[int(len(indices) * 0.9):]
        train_dataset = SSLTrainDataset(Subset(stl_unlabeled, train_indices),
                                        args.num_patches, args.num_angles,
                                        args.poisson_rate)
        val_dataset = SSLValDataset(Subset(stl_unlabeled, val_indices),
                                    args.num_patches, args.num_angles)

        train_sampler = None
        val_sampler = None
        if args.distributed:
            train_sampler = torch.utils.data.distributed.DistributedSampler(
                train_dataset)
            val_sampler = torch.utils.data.distributed.DistributedSampler(
                val_dataset)

        train_loader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=(train_sampler is None),
                                  num_workers=args.workers,
                                  pin_memory=True,
                                  sampler=train_sampler,
                                  collate_fn=fast_collate)

        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                shuffle=False,
                                num_workers=args.workers,
                                pin_memory=True,
                                sampler=val_sampler,
                                collate_fn=fast_collate)

        if args.evaluate:
            rot_val_loss, rot_val_acc, perm_val_loss, perm_val_acc = apex_validate(
                val_loader, model, criterion, args)
            if args.is_master:
                utils.logger.info(
                    f"Rot Val Loss = {rot_val_loss}, Rot Val Accuracy = {rot_val_acc}"
                )
                utils.logger.info(
                    f"Perm Val Loss = {perm_val_loss}, Perm Val Accuracy = {perm_val_acc}"
                )
            return

        # Create dir to save model and command-line args
        if args.is_master:
            model_dir = time.ctime().replace(" ", "_").replace(":", "_")
            model_dir = os.path.join("models", model_dir)
            os.makedirs(model_dir, exist_ok=True)
            with open(os.path.join(model_dir, "args.json"), "w") as f:
                json.dump(args.__dict__, f, indent=2)
            writer = SummaryWriter()

        for epoch in range(args.start_epoch, args.epochs):
            if args.distributed:
                train_sampler.set_epoch(epoch)

            # train for one epoch
            rot_train_loss, rot_train_acc, perm_train_loss, perm_train_acc = apex_train(
                train_loader, model, criterion, optimiser, args, epoch)

            # evaluate on validation set
            rot_val_loss, rot_val_acc, perm_val_loss, perm_val_acc = apex_validate(
                val_loader, model, criterion, args)

            if (epoch + 1) % args.learn_prd == 0:
                args.poisson_rate += 1
                train_loader.dataset.set_poisson_rate(args.poisson_rate)

            # remember best Acc and save checkpoint
            if args.is_master:
                is_best = perm_val_acc > best_acc
                best_acc = max(perm_val_acc, best_acc)
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'state_dict': model.state_dict(),
                        'best_acc': best_acc,
                        'optimiser': optimiser.state_dict(),
                        "poisson_rate": args.poisson_rate
                    }, is_best, model_dir)

                writer.add_scalars("Rot_Loss", {
                    "train_loss": rot_train_loss,
                    "val_loss": rot_val_loss
                }, epoch)
                writer.add_scalars("Perm_Loss", {
                    "train_loss": perm_train_loss,
                    "val_loss": perm_val_loss
                }, epoch)
                writer.add_scalars("Rot_Accuracy", {
                    "train_acc": rot_train_acc,
                    "val_acc": rot_val_acc
                }, epoch)
                writer.add_scalars("Perm_Accuracy", {
                    "train_acc": perm_train_acc,
                    "val_acc": perm_val_acc
                }, epoch)
                writer.add_scalar("Poisson_Rate",
                                  train_loader.dataset.pdist.rate, epoch)
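
The actual mixed-precision step is hidden inside apex_train. For reference, the same effect with PyTorch's native torch.cuda.amp — a self-contained sketch, not this repo's code; Apex's amp.scale_loss context plays the role of the GradScaler here:

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model = torch.nn.Linear(16, 4).to(device)
optimiser = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = torch.nn.CrossEntropyLoss()
scaler = torch.cuda.amp.GradScaler(enabled=(device == "cuda"))

for _ in range(3):
    inputs = torch.randn(8, 16, device=device)
    targets = torch.randint(0, 4, (8,), device=device)
    optimiser.zero_grad()
    with torch.cuda.amp.autocast(enabled=(device == "cuda")):
        loss = criterion(model(inputs), targets)
    scaler.scale(loss).backward()  # scale the loss to avoid fp16 underflow
    scaler.step(optimiser)         # unscales grads; skips the step on inf/NaN
    scaler.update()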
Example #14
def main():
    hostname = socket.gethostname()
    setup_logging(os.path.join(args.results_dir, 'log_{}.txt'.format(hostname)))
    logging.info("running arguments: %s", args)

    best_gpu = setup_gpus()
    torch.cuda.set_device(best_gpu)
    torch.backends.cudnn.benchmark = True

    train_transform = get_transform(args.dataset, 'train')
    train_data = get_dataset(args.dataset, args.train_split, train_transform)
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True)

    val_transform = get_transform(args.dataset, 'val')
    val_data = get_dataset(args.dataset, 'val', val_transform)
    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    bit_width_list = list(map(int, args.bit_width_list.split(',')))
    bit_width_list.sort()
    model = models.__dict__[args.model](bit_width_list, train_data.num_classes).cuda()

    lr_decay = list(map(int, args.lr_decay.split(',')))
    optimizer = get_optimizer_config(model, args.optimizer, args.lr, args.weight_decay)
    lr_scheduler = None
    best_prec1 = None
    if args.resume and args.resume != 'None':
        if os.path.isdir(args.resume):
            args.resume = os.path.join(args.resume, 'model_best.pth.tar')
        if os.path.isfile(args.resume):
            checkpoint = torch.load(args.resume, map_location='cuda:{}'.format(best_gpu))
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler = get_lr_scheduler(args.optimizer, optimizer, lr_decay, checkpoint['epoch'])
            logging.info("loaded resume checkpoint '%s' (epoch %s)", args.resume, checkpoint['epoch'])
        else:
            raise ValueError('Resume model path error!')
    elif args.pretrain and args.pretrain != 'None':
        if os.path.isdir(args.pretrain):
            args.pretrain = os.path.join(args.pretrain, 'model_best.pth.tar')
        if os.path.isfile(args.pretrain):
            checkpoint = torch.load(args.pretrain, map_location='cuda:{}'.format(best_gpu))
            model.load_state_dict(checkpoint['state_dict'], strict=False)
            logging.info("loaded pretrain checkpoint '%s' (epoch %s)", args.pretrain, checkpoint['epoch'])
        else:
            raise ValueError('Pretrained model path error!')
    if lr_scheduler is None:
        lr_scheduler = get_lr_scheduler(args.optimizer, optimizer, lr_decay)
    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info("number of parameters: %d", num_parameters)

    criterion = nn.CrossEntropyLoss().cuda()
    criterion_soft = CrossEntropyLossSoft().cuda()
    sum_writer = SummaryWriter(args.results_dir + '/summary')

    for epoch in range(args.start_epoch, args.epochs):
        model.train()
        train_loss, train_prec1, train_prec5 = forward(train_loader, model, criterion, criterion_soft, epoch, True,
                                                       optimizer, sum_writer)
        model.eval()
        val_loss, val_prec1, val_prec5 = forward(val_loader, model, criterion, criterion_soft, epoch, False)

        if isinstance(lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            lr_scheduler.step(val_loss)
        else:
            lr_scheduler.step()

        if best_prec1 is None:
            is_best = True
            best_prec1 = val_prec1[-1]
        else:
            is_best = val_prec1[-1] > best_prec1
            best_prec1 = max(val_prec1[-1], best_prec1)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': args.model,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'optimizer': optimizer.state_dict()
            },
            is_best,
            path=args.results_dir + '/ckpt')

        if sum_writer is not None:
            sum_writer.add_scalar('lr', optimizer.param_groups[0]['lr'], global_step=epoch)
            for bw, tl, tp1, tp5, vl, vp1, vp5 in zip(bit_width_list, train_loss, train_prec1, train_prec5, val_loss,
                                                      val_prec1, val_prec5):
                sum_writer.add_scalar('train_loss_{}'.format(bw), tl, global_step=epoch)
                sum_writer.add_scalar('train_prec_1_{}'.format(bw), tp1, global_step=epoch)
                sum_writer.add_scalar('train_prec_5_{}'.format(bw), tp5, global_step=epoch)
                sum_writer.add_scalar('val_loss_{}'.format(bw), vl, global_step=epoch)
                sum_writer.add_scalar('val_prec_1_{}'.format(bw), vp1, global_step=epoch)
                sum_writer.add_scalar('val_prec_5_{}'.format(bw), vp5, global_step=epoch)
        logging.info('Epoch {}: \ntrain loss {:.2f}, train prec1 {:.2f}, train prec5 {:.2f}\n'
                     '  val loss {:.2f},   val prec1 {:.2f},   val prec5 {:.2f}'.format(
                         epoch, train_loss[-1], train_prec1[-1], train_prec5[-1], val_loss[-1], val_prec1[-1],
                         val_prec5[-1]))
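
CrossEntropyLossSoft appears to be this repo's distillation loss between bit-widths: cross entropy against the soft distribution produced by the highest-precision branch. A minimal sketch of the standard formulation (the repo's version may reshape its inputs differently):

import torch
import torch.nn.functional as F

class CrossEntropyLossSoft(torch.nn.Module):
    # Soft-target cross entropy: -(q * log_softmax(z)) summed over classes, averaged.
    def forward(self, output, target):
        log_probs = F.log_softmax(output, dim=1)
        return (-target * log_probs).sum(dim=1).mean()

# Example: distil a low-bit branch toward the full-precision branch's softmax.
student_logits = torch.randn(4, 10)
soft_targets = F.softmax(torch.randn(4, 10), dim=1)
loss = CrossEntropyLossSoft()(student_logits, soft_targets)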
Example #15
        swa_state_dict = checkpoint['swa_state_dict']
        if swa_state_dict is not None:
            swa_model.load_state_dict(swa_state_dict)
        swa_n_ckpt = checkpoint['swa_n']
        if swa_n_ckpt is not None:
            swa_n = swa_n_ckpt

columns = ['ep', 'lr', 'tr_loss', 'tr_acc', 'te_loss', 'te_acc', 'time']
if args.swa:
    columns = columns[:-1] + ['swa_te_loss', 'swa_te_acc'] + columns[-1:]
    swa_res = {'loss': None, 'accuracy': None}

utils.save_checkpoint(
    args.dir,
    start_epoch,
    state_dict=model.state_dict(),
    swa_state_dict=swa_model.state_dict() if args.swa else None,
    swa_n=swa_n if args.swa else None,
    optimizer=optimizer.state_dict()
)

for epoch in range(start_epoch, args.epochs):
    time_ep = time.time()

    lr = schedule(epoch)
    utils.adjust_learning_rate(optimizer, lr)
    train_res = utils.train_epoch(loaders['train'], model, criterion, optimizer)
    if epoch == 0 or epoch % args.eval_freq == args.eval_freq - 1 or epoch == args.epochs - 1:
        test_res = utils.eval(loaders['test'], model, criterion)
    else:
        test_res = {'loss': None, 'accuracy': None}
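
The SWA fragment above maintains swa_model and a sample counter swa_n; between the epochs shown, the running average of weights is refreshed. A sketch of the standard update (the SWA reference code's moving_average utility has this form; the local utils may differ):

def moving_average(swa_model, model, swa_n):
    # Fold the current weights into the running average; after the call
    # the average covers swa_n + 1 models.
    alpha = 1.0 / (swa_n + 1)
    for swa_param, param in zip(swa_model.parameters(), model.parameters()):
        swa_param.data *= 1.0 - alpha
        swa_param.data += param.data * alpha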
Example #16
def train_and_evaluate(model,
                       train_data,
                       val_data,
                       optimizer,
                       loss_fn,
                       metrics,
                       params,
                       model_dir,
                       restore_file=None):
    """Train the model and evaluate every epoch.

    Args:
        model: (torch.nn.Module) the neural network
        train_data: (dict) training data with keys 'data' and 'labels'
        val_data: (dict) validation data with keys 'data' and 'labels'
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch
        params: (Params) hyperparameters
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) optional - name of file to restore from (without its extension .pth.tar)
    """
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_val_acc = 0.0
    val_acc_history = []

    for epoch in range(params.num_epochs):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        num_steps = (params.train_size + 1) // params.batch_size
        train_data_iterator = data_loader.data_iterator(train_data,
                                                        params,
                                                        shuffle=True)
        #print("before getting into train")
        #print(num_steps)
        #print(train_data_iterator)
        train(model, optimizer, loss_fn, train_data_iterator, metrics, params,
              num_steps)

        # Evaluate for one epoch on validation set
        num_steps = (params.val_size + 1) // params.batch_size
        #print("before getting into val")
        #print(num_steps)
        val_data_iterator = data_loader.data_iterator(val_data,
                                                      params,
                                                      shuffle=False)
        val_metrics = evaluate(model, loss_fn, val_data_iterator, metrics,
                               params, num_steps)

        val_acc = val_metrics['accuracy']
        is_best = val_acc >= best_val_acc

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir)

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir,
                                          "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir,
                                      "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)

        # track and periodically plot the validation-accuracy curve
        val_acc_history.append(val_acc)
        if epoch % 100 == 0:
            plt.plot(val_acc_history)
            plt.savefig(os.path.join(model_dir, "epoch_{}.jpg".format(epoch)))
            plt.close()
Example #17
def train(train_epoch, phase='train'):
    global global_step

    lr_decay(global_step)
    print("epoch %3d with lr=%.02e" % (train_epoch, get_lr()))

    ssrn.train(phase == 'train')
    torch.set_grad_enabled(phase == 'train')
    data_loader = train_data_loader if phase == 'train' else valid_data_loader

    it = 0
    running_loss = 0.0
    running_l1_loss = 0.0

    pbar = tqdm(data_loader,
                unit="audios",
                unit_scale=data_loader.batch_size,
                disable=hp.disable_progress_bar)
    for batch in pbar:
        M, S = batch['mags'], batch['mels']
        M = M.permute(0, 2, 1)  # TODO: because of pre processing
        S = S.permute(0, 2, 1)  # TODO: because of pre processing

        M.requires_grad = False
        M = M.cuda()
        S = S.cuda()

        Z_logit, Z = ssrn(S)

        l1_loss = F.l1_loss(Z, M)

        loss = l1_loss

        if phase == 'train':
            lr_decay(global_step)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            global_step += 1

        it += 1

        loss = loss.item()
        l1_loss = l1_loss.item()
        running_loss += loss
        running_l1_loss += l1_loss

        if phase == 'train':
            # update the progress bar
            pbar.set_postfix({'l1': "%.05f" % (running_l1_loss / it)})
            logger.log_step(phase, global_step, {'loss_l1': l1_loss}, {
                'mags-true': M[:1, :, :],
                'mags-pred': Z[:1, :, :],
                'mels': S[:1, :, :]
            })
            if global_step % 5000 == 0:
                # checkpoint at every 5000th step
                save_checkpoint(logger.logdir, train_epoch, global_step, ssrn,
                                optimizer)

    epoch_loss = running_loss / it
    epoch_l1_loss = running_l1_loss / it

    logger.log_epoch(phase, global_step, {'loss_l1': epoch_l1_loss})

    return epoch_loss
Example #18
def main():
    global best_acc

    if not os.path.isdir(args.checkpoint):
        os.makedirs(args.checkpoint)

    # data
    transformations = get_transforms(input_size=args.image_size,
                                     test_size=args.image_size)
    train_set = data_gen.Dataset(root=args.train_txt_path,
                                 transform=transformations['val_train'])
    train_loader = data.DataLoader(train_set,
                                   batch_size=args.batch_size,
                                   shuffle=True)

    val_set = data_gen.ValDataset(root=args.val_txt_path,
                                  transform=transformations['val_test'])
    val_loader = data.DataLoader(val_set,
                                 batch_size=args.batch_size,
                                 shuffle=False)

    # model
    model = make_model(args)
    if use_cuda:
        model.cuda()

    # define loss function and optimizer
    if use_cuda:
        criterion = nn.CrossEntropyLoss().cuda()
    else:
        criterion = nn.CrossEntropyLoss()

    optimizer = get_optimizer(model, args)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode='min',
                                                           factor=0.2,
                                                           patience=5,
                                                           verbose=False)

    # load checkpoint
    start_epoch = args.start_epoch
    # if args.resume:
    #     print("===> Resuming from checkpoint")
    #     assert os.path.isfile(args.resume),'Error: no checkpoint directory found'
    #     args.checkpoint = os.path.dirname(args.resume)  # strip the filename, keep the directory
    #     checkpoint = torch.load(args.resume)
    #     best_acc = checkpoint['best_acc']
    #     start_epoch = checkpoint['epoch']
    #     model.module.load_state_dict(checkpoint['state_dict'])
    #     optimizer.load_state_dict(checkpoint['optimizer'])

    # train
    for epoch in range(start_epoch, args.epochs):
        print('\nEpoch: [%d | %d] LR: %f' %
              (epoch + 1, args.epochs, optimizer.param_groups[0]['lr']))

        train_loss, train_acc = train(train_loader, model, criterion,
                                      optimizer, epoch, use_cuda)
        test_loss, val_acc = val(val_loader, model, criterion, epoch, use_cuda)

        scheduler.step(test_loss)

        print(
            f'train_loss:{train_loss}\t val_loss:{test_loss}\t train_acc:{train_acc} \t val_acc:{val_acc}'
        )

        # save_model
        is_best = val_acc >= best_acc
        best_acc = max(val_acc, best_acc)

        save_checkpoint(
            {
                'fold': 0,
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'train_acc': train_acc,
                'acc': val_acc,
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
            },
            is_best,
            single=True,
            checkpoint=args.checkpoint)

    print("best acc = ", best_acc)
Example #19
def main():
    logger.info("Logger is set - training start")

    # set default gpu device id
    torch.cuda.set_device(config.gpus[0])

    # set seed
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)

    torch.backends.cudnn.benchmark = True

    # get data with meta info
    input_size, input_channels, n_classes, train_data, valid_data = utils.get_data(
        config.dataset,
        config.data_path,
        config.cutout_length,
        validation=True)

    criterion = nn.CrossEntropyLoss().to(device)
    use_aux = config.aux_weight > 0.0
    if config.dataset in utils.LARGE_DATASETS:
        model = AugmentCNNImageNet(
            input_size,
            input_channels,
            config.init_channels,
            n_classes,
            config.layers,
            use_aux,
            config.genotype,
        )
    else:
        model = AugmentCNN(
            input_size,
            input_channels,
            config.init_channels,
            n_classes,
            config.layers,
            use_aux,
            config.genotype,
        )
    model = nn.DataParallel(model, device_ids=config.gpus).to(device)

    # model size
    mb_params = utils.param_size(model)
    logger.info("Model size = {:.3f} MB".format(mb_params))

    # weights optimizer
    optimizer = torch.optim.SGD(
        model.parameters(),
        config.lr,
        momentum=config.momentum,
        weight_decay=config.weight_decay,
    )

    train_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=config.batch_size,
        shuffle=True,
        num_workers=config.workers,
        pin_memory=True,
    )
    valid_loader = torch.utils.data.DataLoader(
        valid_data,
        batch_size=config.batch_size,
        shuffle=False,
        num_workers=config.workers,
        pin_memory=True,
    )
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, config.epochs)

    best_top1 = 0.0
    # training loop
    for epoch in range(config.epochs):
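        # NOTE: stepping the scheduler before training follows the pre-1.1
        # PyTorch convention; on 1.1+ this effectively skips the initial LR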
        lr_scheduler.step()
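        # linearly ramp the drop-path probability over the course of training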
        drop_prob = config.drop_path_prob * epoch / config.epochs
        model.module.drop_path_prob(drop_prob)

        # training
        train(train_loader, model, optimizer, criterion, epoch)

        # validation
        cur_step = (epoch + 1) * len(train_loader)
        top1 = validate(valid_loader, model, criterion, epoch, cur_step)

        # save
        is_best = top1 > best_top1
        best_top1 = max(best_top1, top1)
        utils.save_checkpoint(model, config.path, is_best)

        print("")

    logger.info("Final best Prec@1 = {:.4%} for job {}".format(
        best_top1, config.name))
Example #20
    best_acc = 0
    for epoch in range(config['num_epochs']):
        train_loss = train(model,
                           data_loader=train_loader,
                           criterion=criterion,
                           optimizer=optimizer,
                           epoch=epoch,
                           to_log=path['log'])
        test_loss, acc = test(model,
                              test_loader=test_loader,
                              criterion=criterion,
                              to_log=path['log'])
        is_best = acc >= best_acc
        best_acc = max(acc, best_acc)
        save_checkpoint(model.state_dict(),
                        is_best=is_best,
                        checkpoint=path['dir'])

        lr_scheduler.step()

        metrics_dic['loss'].append(test_loss)
        metrics_dic['precision'].append(acc)

    # print best acc after training
    write_log("<<<<< Best Accuracy = {:.2f} >>>>>".format(best_acc),
              path['log'])
Example #21
def main() -> None:
    global best_loss

    args = parser.parse_args()

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    start_epoch = 0

    vcf_reader = VCFReader(args.train_data, args.classification_map,
                           args.chromosome, args.class_hierarchy)
    vcf_writer = vcf_reader.get_vcf_writer()
    train_dataset, validation_dataset = vcf_reader.get_datasets(
        args.validation_split)
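    # BatchByLabelRandomSampler is assumed (from its name) to draw each batch
    # from a single label group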
    train_sampler = BatchByLabelRandomSampler(args.batch_size,
                                              train_dataset.labels)
    train_loader = DataLoader(train_dataset, batch_sampler=train_sampler)

    if args.validation_split != 0:
        validation_sampler = BatchByLabelRandomSampler(
            args.batch_size, validation_dataset.labels)
        validation_loader = DataLoader(validation_dataset,
                                       batch_sampler=validation_sampler)

    kwargs = {
        'total_size': vcf_reader.positions.shape[0],
        'window_size': args.window_size,
        'num_layers': args.layers,
        'num_classes': len(vcf_reader.label_encoder.classes_),
        'num_super_classes': len(vcf_reader.super_label_encoder.classes_)
    }
    model = WindowedMLP(**kwargs)
    model.to(get_device(args))

    optimizer = AdamW(model.parameters(), lr=args.learning_rate)

    #######
    if args.resume_path is not None:
        if os.path.isfile(args.resume_path):
            print("=> loading checkpoint '{}'".format(args.resume_path))
            checkpoint = torch.load(args.resume_path)
            if kwargs != checkpoint['model_kwargs']:
                raise ValueError(
                    'The checkpoint\'s kwargs don\'t match the ones used to initialize the model'
                )
            if vcf_reader.snps.shape[0] != checkpoint['vcf_writer'].snps.shape[0]:
                raise ValueError(
                    'The data on which the checkpoint was trained had a different number of snp positions'
                )
            start_epoch = checkpoint['epoch']
            best_loss = checkpoint['best_loss']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume_path, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    #############

    if args.validate:
        validate(validation_loader, model,
                 nn.functional.binary_cross_entropy_with_logits,
                 len(vcf_reader.label_encoder.classes_),
                 len(vcf_reader.super_label_encoder.classes_), vcf_reader.maf,
                 args)
        return

    for epoch in range(start_epoch, args.epochs + start_epoch):
        loss = train(train_loader, model,
                     nn.functional.binary_cross_entropy_with_logits, optimizer,
                     len(vcf_reader.label_encoder.classes_),
                     len(vcf_reader.super_label_encoder.classes_),
                     vcf_reader.maf, epoch, args)

        if epoch % args.save_freq == 0 or epoch == args.epochs + start_epoch - 1:
            if args.validation_split != 0:
                validation_loss = validate(
                    validation_loader, model,
                    nn.functional.binary_cross_entropy_with_logits,
                    len(vcf_reader.label_encoder.classes_),
                    len(vcf_reader.super_label_encoder.classes_),
                    vcf_reader.maf, args)
                is_best = validation_loss < best_loss
                best_loss = min(validation_loss, best_loss)
            else:
                is_best = loss < best_loss
                best_loss = min(loss, best_loss)

            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'model_kwargs': kwargs,
                    'best_loss': best_loss,
                    'optimizer': optimizer.state_dict(),
                    'vcf_writer': vcf_writer,
                    'label_encoder': vcf_reader.label_encoder,
                    'super_label_encoder': vcf_reader.super_label_encoder,
                    'maf': vcf_reader.maf
                }, is_best, args.chromosome, args.model_name, args.model_dir)
Example #22
def main():
    global args, best_result, output_directory, train_csv, test_csv

    # Data loading code
    print("=> creating data loaders...")

    data_dir = '/p300/dataset'
    train_dir = os.path.join(data_dir, 'data', args.data, 'train')
    val_dir = os.path.join(data_dir, 'data', args.data, 'val')

    if args.data == 'nyudepthv2':
        from dataloaders.nyu import NYUDataset
        train_dataset = NYUDataset(train_dir,
                                   split='train',
                                   modality=args.modality)
        val_dataset = NYUDataset(val_dir,
                                 split='val',
                                 modality=args.modality)
    elif args.data == 'rgbd':
        from dataloaders.sist import RGBDDataset
        train_dataset = RGBDDataset(train_dir,
                                    split='train',
                                    modality=args.modality)
        val_dataset = RGBDDataset(val_dir, split='val', modality=args.modality)
    else:
        raise RuntimeError('Dataset not found.')

    # batch size is 1 for both the train and val loaders
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=1,
                                               shuffle=False,
                                               num_workers=args.workers,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)
    print("=> data loaders created.")

    ############################## Resume Mode ##############################
    # loading pretrained model
    print("=> loading model '{}'".format(args.evaluate))
    args.start_epoch = 0
    checkpoint = torch.load(args.evaluate)
    if isinstance(checkpoint, dict):
        # a full checkpoint dict: pull out the wrapped model
        model = checkpoint['model']
        print("=> loaded best model (epoch {})".format(checkpoint['epoch']))
    else:
        model = checkpoint

    ############################## Training Setting ##############################
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # define loss function (criterion) and optimizer
    criterion = None
    if args.criterion == 'l2':
        criterion = criteria.MaskedMSELoss().cuda()
    elif args.criterion == 'l1':
        criterion = criteria.MaskedL1Loss().cuda()

    output_directory = os.path.dirname(args.evaluate)
    best_txt = os.path.join(output_directory, 'best.txt')
    best_model = model  # fallback so best_model is defined even if no epoch improves

    ############################## Training ##############################
    for epoch in range(args.epochs):
        utils.adjust_learning_rate(optimizer, epoch, args.lr)
        train(train_loader, model, criterion, optimizer, epoch)
        result, img_merge = validate(val_loader, model, epoch)

        # remember best rmse and save checkpoint
        is_best = result.rmse < best_result.rmse
        if is_best:
            best_result = result
            best_model = model
            with open(best_txt, 'w') as txtfile:
                txtfile.write(
                    "epoch={}\nmse={:.3f}\nrmse={:.3f}\nabsrel={:.3f}\nlg10={:.3f}\nmae={:.3f}\ndelta1={:.3f}\nt_gpu={:.4f}\n"
                    .format(epoch, result.mse, result.rmse, result.absrel,
                            result.lg10, result.mae, result.delta1,
                            result.gpu_time))
            if img_merge is not None:
                img_filename = output_directory + '/comparison_best.png'
                utils.save_image(img_merge, img_filename)

        utils.save_checkpoint(
            {
                'args': args,
                'epoch': epoch,
                'arch': args.arch,
                'model': model,
                'best_result': best_result,
                'optimizer': optimizer,
            }, is_best, epoch, output_directory)

    # save loss file
    loss_file = np.array(history_loss)
    np.savetxt(output_directory + '/loss.txt', loss_file)

    torch.save(best_model.state_dict(), output_directory + '/best_model.pkl')
Example #23
def train_net(args):
    torch.manual_seed(7)
    np.random.seed(7)
    checkpoint = args.checkpoint
    start_epoch = 0
    best_acc = float('-inf')
    writer = SummaryWriter()
    epochs_since_improvement = 0

    # Initialize / load checkpoint
    if checkpoint is None:
        model = MobileFaceNet()
        metric_fc = ArcMarginModel(args)

        # per-module parameter groups; only conv3 gets the extra 4e-4 weight decay
        optimizer = torch.optim.SGD([{
            'params': model.conv1.parameters()
        }, {
            'params': model.dw_conv.parameters()
        }, {
            'params': model.features.parameters()
        }, {
            'params': model.conv2.parameters()
        }, {
            'params': model.gdconv.parameters()
        }, {
            'params': model.conv3.parameters(),
            'weight_decay': 4e-4
        }, {
            'params': model.bn.parameters()
        }, {
            'params': metric_fc.parameters()
        }],
                                    lr=args.lr,
                                    momentum=args.mom,
                                    weight_decay=args.weight_decay,
                                    nesterov=True)

        model = nn.DataParallel(model)
        metric_fc = nn.DataParallel(metric_fc)

    else:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        epochs_since_improvement = checkpoint['epochs_since_improvement']
        model = checkpoint['model']
        metric_fc = checkpoint['metric_fc']
        optimizer = checkpoint['optimizer']

    logger = get_logger()

    # Move to GPU, if available
    model = model.to(device)
    metric_fc = metric_fc.to(device)

    # Loss function
    if args.focal_loss:
        criterion = FocalLoss(gamma=args.gamma).to(device)
    else:
        criterion = nn.CrossEntropyLoss().to(device)

    # Custom dataloaders
    train_dataset = ArcFaceDataset('train')
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=4)

    scheduler = MultiStepLR(optimizer, milestones=[5, 10, 15, 20], gamma=0.1)

    # Epochs
    for epoch in range(start_epoch, args.end_epoch):
        # One epoch's training
        train_loss, train_acc = train(train_loader=train_loader,
                                      model=model,
                                      metric_fc=metric_fc,
                                      criterion=criterion,
                                      optimizer=optimizer,
                                      epoch=epoch,
                                      logger=logger)

        lr = optimizer.param_groups[0]['lr']
        print('\nLearning rate={}\n'.format(lr))

        writer.add_scalar('model/train_loss', train_loss, epoch)
        writer.add_scalar('model/train_acc', train_acc, epoch)
        writer.add_scalar('model/learning_rate', lr, epoch)

        # One epoch's validation
        megaface_acc = megaface_test(model)
        writer.add_scalar('model/megaface_accuracy', megaface_acc, epoch)

        # Check if there was an improvement
        is_best = megaface_acc > best_acc
        best_acc = max(megaface_acc, best_acc)
        if not is_best:
            epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" %
                  (epochs_since_improvement, ))
        else:
            epochs_since_improvement = 0

        # Save checkpoint
        save_checkpoint(epoch, epochs_since_improvement, model, metric_fc,
                        optimizer, best_acc, is_best)
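        # passing the epoch to step() is the legacy scheduler API; recent
        # PyTorch versions deprecate the argument in favor of plain step()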
        scheduler.step(epoch)
Example #24
def train_net(args):
    torch.manual_seed(7)
    np.random.seed(7)
    checkpoint = args.checkpoint
    start_epoch = 0
    best_acc = 0
    writer = SummaryWriter()
    epochs_since_improvement = 0

    # Initialize / load checkpoint
    if checkpoint is None:
        if args.network == 'r18':
            model = resnet18(args)
        elif args.network == 'r34':
            model = resnet34(args)
        elif args.network == 'r50':
            model = resnet50(args)
        elif args.network == 'r101':
            model = resnet101(args)
        elif args.network == 'r152':
            model = resnet152(args)
        elif args.network == 'mobile':
            model = MobileNet(1.0)
        else:
            model = resnet_face18(args.use_se)
        model = nn.DataParallel(model)
        metric_fc = ArcMarginModel(args)
        metric_fc = nn.DataParallel(metric_fc)

        if args.optimizer == 'sgd':
            optimizer = torch.optim.SGD([{
                'params': model.parameters()
            }, {
                'params': metric_fc.parameters()
            }],
                                        lr=args.lr,
                                        momentum=args.mom,
                                        weight_decay=args.weight_decay)
        else:
            optimizer = torch.optim.Adam([{
                'params': model.parameters()
            }, {
                'params': metric_fc.parameters()
            }],
                                         lr=args.lr,
                                         weight_decay=args.weight_decay)

    else:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        epochs_since_improvement = checkpoint['epochs_since_improvement']
        model = checkpoint['model']
        metric_fc = checkpoint['metric_fc']
        optimizer = checkpoint['optimizer']

    logger = get_logger()

    # Move to GPU, if available
    model = model.to(device)
    metric_fc = metric_fc.to(device)

    # Loss function
    if args.focal_loss:
        criterion = FocalLoss(gamma=args.gamma).to(device)
    else:
        criterion = nn.CrossEntropyLoss().to(device)

    # Custom dataloaders
    train_dataset = ArcFaceDataset('train')
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True)

    scheduler = StepLR(optimizer, step_size=args.lr_step, gamma=0.1)

    # Epochs
    for epoch in range(start_epoch, args.end_epoch):
        scheduler.step()

        if args.full_log:
            lfw_acc, threshold = lfw_test(model)
            writer.add_scalar('LFW_Accuracy', lfw_acc, epoch)
            full_log(epoch)

        start = datetime.now()
        # One epoch's training
        train_loss, train_top5_accs = train(train_loader=train_loader,
                                            model=model,
                                            metric_fc=metric_fc,
                                            criterion=criterion,
                                            optimizer=optimizer,
                                            epoch=epoch,
                                            logger=logger)

        writer.add_scalar('Train_Loss', train_loss, epoch)
        writer.add_scalar('Train_Top5_Accuracy', train_top5_accs, epoch)

        end = datetime.now()
        delta = end - start
        print('{} seconds'.format(delta.seconds))

        # One epoch's validation
        lfw_acc, threshold = lfw_test(model)
        writer.add_scalar('LFW_Accuracy', lfw_acc, epoch)

        # Check if there was an improvement
        is_best = lfw_acc > best_acc
        best_acc = max(lfw_acc, best_acc)
        if not is_best:
            epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" %
                  (epochs_since_improvement, ))
        else:
            epochs_since_improvement = 0

        # Save checkpoint
        save_checkpoint(epoch, epochs_since_improvement, model, metric_fc,
                        optimizer, best_acc, is_best)
Example #25
def train(model, loss_fn, optimizer, history, trainset, valset, config):
    """ Trains the model by optimizing with respect to the given loss
        function using the given optimizer.

        Args:
            model: torch.nn.Module 
                Defines the model.
            loss_fn: torch.nn.Module 
                Defines the loss function.
            optimizer: torch.optim.optimizer 
                Defines the optimizer.
            history: dict
                Contains histories of desired run metrics.
            trainset: torch.utils.data.Dataset 
                Contains the training data.
            valset: torch.utils.data.Dataset 
                Contains the validation data.
            config: dict
                Configures the training loop. Contains the following keys:

                batch_size: int
                    The number of examples to process per batch.
                    Default value is 20.
                start_epoch: int
                    The epoch to start on for training.
                    Default value is 1.
                num_epochs: int
                    How many epochs to train the model.
                    Default value is 20.
                log_every: int
                    How often to save model checkpoint. 
                    To turn off logging, set this value to 0.
                    Default value is 5.
                plot_every: int
                    How often to generate plots.
                    To turn off plotting, set this value to 0.
                    Default value is 5.
                num_workers: int
                    How many workers to assign to the DataLoader.
                    Default value is 4.
                verbose: boolean
                    Whether or not to print results to console during training.
                    Progress bar is still included. Default value is False.
                checkpoint_dir: string
                    Where to write checkpoints and plots.
                    Default value is "checkpoints".
                gamma: float
                    Decay factor for the exponential LR scheduler.
                    Default value is 0.1.

        Returns:
            model: a torch.nn.Module defining the trained model.
    """

    # Get keyword parameter values
    batch_size = config.get("batch_size", 20)
    start_epoch = config.get("start_epoch", 1)
    num_epochs = config.get("num_epochs", 20)
    log_every = config.get("log_every", 5)
    plot_every = config.get("plot_every", 5)
    num_workers = config.get("num_workers", 4)
    checkpoint_dir = config.get("checkpoint_dir", "checkpoints")
    verbose = config.get("verbose", False)
    gamma = config.get("gamma", 0.1)

    # Learning rate scheduler
    scheduler = ExponentialLR(optimizer, gamma=gamma)

    # Use the f1 score to determine best checkpoint
    best_val_f1 = 0

    # Training loop
    for epoch in tqdm(range(start_epoch, num_epochs + 1),
                      desc="Epochs",
                      position=0):

        # Process training dataset
        model, train_results, train_cm = \
            process_batches(model, trainset, loss_fn, batch_size, num_workers,
                            desc="train", optimizer=optimizer, is_training=True)
        if verbose:
            tqdm.write(
                PROGRESS_MSG.format(train_results["accuracy"],
                                    train_results["precision"],
                                    train_results["recall"],
                                    train_results["f1"]))

        # Process validation dataset
        val_results, val_cm = \
            process_batches(model, valset, loss_fn, batch_size, num_workers,
                            desc="val", optimizer=optimizer, is_training=False)
        if verbose:
            tqdm.write(
                PROGRESS_MSG.format(val_results["accuracy"],
                                    val_results["precision"],
                                    val_results["recall"], val_results["f1"]))

        # Take step for LR
        scheduler.step()

        # Update run history
        for name, val in train_results.items():
            history["train_{}".format(name)].append(val)
        for name, val in val_results.items():
            history["val_{}".format(name)].append(val)

        # Update best checkpoint
        if val_results["f1"] > best_val_f1:
            if verbose:
                tqdm.write("New best checkpoint!")
            best_val_f1 = val_results["f1"]
            filepath = os.path.join(checkpoint_dir, "best_checkpoint")
            save_checkpoint(model, optimizer, history, epoch + 1, filepath)

        # Save checkpoint
        if log_every != 0 and epoch % log_every == 0:
            if verbose:
                tqdm.write("Saving checkpoint...")
            filename = "checkpoint_epoch_{}".format(epoch)
            filepath = os.path.join(checkpoint_dir, filename)
            save_checkpoint(model, optimizer, history, epoch + 1, filepath)

        # Generate plots
        if plot_every != 0 and epoch % plot_every == 0:
            if verbose:
                tqdm.write("Generating plots...")
            generate_plots(history, checkpoint_dir)

    return model
Example #26
def train(train_epoch, phase='train'):
    global global_step

    lr_decay(global_step)
    print("epoch %3d with lr=%.02e" % (train_epoch, get_lr()))

    text2mel.train(phase == 'train')
    torch.set_grad_enabled(phase == 'train')
    data_loader = train_data_loader if phase == 'train' else valid_data_loader

    it = 0
    running_loss = 0.0
    running_l1_loss = 0.0
    running_att_loss = 0.0

    pbar = tqdm(data_loader,
                unit="audios",
                unit_scale=data_loader.batch_size,
                disable=hp.disable_progress_bar)
    for batch in pbar:
        L, S, gates = batch['texts'], batch['mels'], batch['mel_gates']
        S = S.permute(0, 2, 1)  # TODO: because of pre processing

        B, N = L.size()  # batch size and text count
        _, n_mels, T = S.size()  # number of melspectrogram bins and time

        assert gates.size(0) == B  # TODO: later remove
        assert gates.size(1) == T

        S_shifted = torch.cat((S[:, :, 1:], torch.zeros(B, n_mels, 1)), 2)

        S.requires_grad = False
        S_shifted.requires_grad = False
        gates.requires_grad = False

        # guided-attention weight (Tachibana et al., 2017): penalizes attention
        # far from the diagonal n/N ~ t/T, with width controlled by g
        def W_nt(_, n, t, g=0.2):
            return 1.0 - np.exp(-((n / float(N) - t / float(T))**2) /
                                (2 * g**2))

        W = np.fromfunction(W_nt, (B, N, T), dtype=np.float32)
        W = torch.from_numpy(W)

        L = L.cuda()
        S = S.cuda()
        S_shifted = S_shifted.cuda()
        W = W.cuda()
        gates = gates.cuda()

        Y_logit, Y, A = text2mel(L, S)

        l1_loss = F.l1_loss(Y, S_shifted)
        masks = gates.reshape(B, 1, T).float()
        att_loss = (A * W * masks).mean()

        loss = l1_loss + att_loss

        if phase == 'train':
            lr_decay(global_step)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            global_step += 1

        it += 1

        loss, l1_loss, att_loss = loss.item(), l1_loss.item(), att_loss.item()
        running_loss += loss
        running_l1_loss += l1_loss
        running_att_loss += att_loss

        if phase == 'train':
            # update the progress bar
            pbar.set_postfix({
                'l1': "%.05f" % (running_l1_loss / it),
                'att': "%.05f" % (running_att_loss / it)
            })
            logger.log_step(phase, global_step, {
                'loss_l1': l1_loss,
                'loss_att': att_loss
            }, {
                'mels-true': S[:1, :, :],
                'mels-pred': Y[:1, :, :],
                'attention': A[:1, :, :]
            })
            if global_step % 5000 == 0:
                # checkpoint at every 5000th step
                save_checkpoint(logger.logdir, train_epoch, global_step,
                                text2mel, optimizer)

    epoch_loss = running_loss / it
    epoch_l1_loss = running_l1_loss / it
    epoch_att_loss = running_att_loss / it

    logger.log_epoch(phase, global_step, {
        'loss_l1': epoch_l1_loss,
        'loss_att': epoch_att_loss
    })

    return epoch_loss
Example #27
def main():
    global n_iter
    args = parser.parse_args()
    save_path = save_path_formatter(args, parser)
    args.save_path = 'checkpoints' / save_path
    print('=> will save everything to {}'.format(args.save_path))
    args.save_path.makedirs_p()
    torch.manual_seed(args.seed)

    training_writer = SummaryWriter(args.save_path)
    output_writers = []
    if args.log_output:
        for i in range(3):
            output_writers.append(
                SummaryWriter(args.save_path / 'valid' / str(i)))

    # Data loading code
    normalize = custom_transforms.Normalize(mean=[0.5, 0.5, 0.5],
                                            std=[0.5, 0.5, 0.5])
    train_transform = custom_transforms.Compose([
        custom_transforms.RandomScaleCrop(),
        custom_transforms.ArrayToTensor(), normalize
    ])

    valid_transform = custom_transforms.Compose(
        [custom_transforms.ArrayToTensor(), normalize])

    print("=> fetching scenes in '{}'".format(args.data))
    train_set = SequenceFolder(args.data,
                               transform=train_transform,
                               seed=args.seed,
                               ttype=args.ttype)
    val_set = SequenceFolder(args.data,
                             transform=valid_transform,
                             seed=args.seed,
                             ttype=args.ttype2)

    print('{} samples found in {} train scenes'.format(len(train_set),
                                                       len(train_set.scenes)))
    print('{} samples found in {} valid scenes'.format(len(val_set),
                                                       len(val_set.scenes)))
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_set,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.epoch_size == 0:
        args.epoch_size = len(train_loader)

    # create model
    print("=> creating model")

    dpsnet = PSNet(args.nlabel, args.mindepth).cuda()

    if args.pretrained_dps:
        print("=> using pre-trained weights for DPSNet")
        weights = torch.load(args.pretrained_dps)
        dpsnet.load_state_dict(weights['state_dict'])
    else:
        dpsnet.init_weights()

    cudnn.benchmark = True
    dpsnet = torch.nn.DataParallel(dpsnet)

    print('=> setting adam solver')

    parameters = chain(dpsnet.parameters())
    optimizer = torch.optim.Adam(parameters,
                                 args.lr,
                                 betas=(args.momentum, args.beta),
                                 weight_decay=args.weight_decay)

    with open(args.save_path / args.log_summary, 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t')
        writer.writerow(['train_loss', 'validation_loss'])

    with open(args.save_path / args.log_full, 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t')
        writer.writerow(['train_loss'])

    for epoch in range(args.epochs):
        adjust_learning_rate(args, optimizer, epoch)

        # train for one epoch
        train_loss = train(args, train_loader, dpsnet, optimizer,
                           args.epoch_size, training_writer)
        errors, error_names = validate_with_gt(args, val_loader, dpsnet, epoch,
                                               output_writers)

        error_string = ', '.join('{} : {:.3f}'.format(name, error)
                                 for name, error in zip(error_names, errors))

        for error, name in zip(errors, error_names):
            training_writer.add_scalar(name, error, epoch)

        # Choose the most relevant error for measuring model performance; note
        # that some measures (e.g. a1, a2, a3) are better when larger.
        decisive_error = errors[0]
        save_checkpoint(args.save_path, {
            'epoch': epoch + 1,
            'state_dict': dpsnet.module.state_dict()
        }, epoch)

        with open(args.save_path / args.log_summary, 'a') as csvfile:
            writer = csv.writer(csvfile, delimiter='\t')
            writer.writerow([train_loss, decisive_error])
Example #28
def run(args):
    start_epoch = 1
    best_loss = 1e+9

    # logs
    args.logdir = get_logdir(args)
    logger = get_logger(os.path.join(args.logdir, 'main.log'))
    logger.info(args)
    writer = SummaryWriter(args.logdir)

    # data
    train_set = MovingMNIST(root='./data', train=True, download=True)
    valid_set = MovingMNIST(root='./data',
                            train=False,
                            download=True,
                            split=args.test_size)
    train_loader = DataLoader(train_set,
                              batch_size=args.batch_size,
                              num_workers=args.n_workers,
                              shuffle=True)
    valid_loader = DataLoader(valid_set,
                              batch_size=args.batch_size,
                              num_workers=args.n_workers,
                              shuffle=False)

    # network
    model = models.__dict__[args.model](args=args)
    model = nn.DataParallel(model)
    args.device = torch.device(
        'cuda:0' if torch.cuda.is_available() else 'cpu')
    model = model.to(args.device)
    # training
    criterion = get_loss_fn(args)
    optimizer = get_optimizer(model, args)
    scheduler = get_scheduler(optimizer, args)

    if args.resume:
        if os.path.isfile(args.resume):
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch'] + 1
            best_loss = checkpoint['best/{}'.format(args.loss)]
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            logger.info('Loaded checkpoint {} (epoch {})'.format(
                args.resume, start_epoch - 1))
        else:
            raise IOError('No such file {}'.format(args.resume))

    for epoch_i in range(start_epoch, args.epochs + 1):
        training = train(train_loader,
                         model,
                         criterion,
                         optimizer,
                         logger=logger,
                         args=args)
        validation = validate(valid_loader,
                              model,
                              criterion,
                              logger=logger,
                              args=args)

        writer.add_scalar('Train/{}'.format(args.loss), training[args.loss],
                          epoch_i)
        writer.add_scalar('Valid/{}'.format(args.loss), validation[args.loss],
                          epoch_i)
        writer.add_image('Train/Predict', _get_images(training['output'],
                                                      args), epoch_i)
        writer.add_image('Train/Target', _get_images(training['target'], args),
                         epoch_i)
        writer.add_image('Valid/Predict',
                         _get_images(validation['output'], args), epoch_i)
        writer.add_image('Valid/Target', _get_images(validation['target'],
                                                     args), epoch_i)

        message = '[{}] Epoch {} Train/{} {:.4f} Valid/{} {:.4f} '
        message = message.format(
            args.expid,
            epoch_i,
            args.loss,
            training[args.loss],
            args.loss,
            validation[args.loss],
        )

        is_best = validation[args.loss] < best_loss
        if is_best:
            best_loss = validation[args.loss]
            message += '(Best)'
        save_checkpoint(
            {
                'epoch': epoch_i,
                'state_dict': model.state_dict(),
                'valid/{}'.format(args.loss): validation[args.loss],
                'best/{}'.format(args.loss): best_loss,
                'optimizer': optimizer.state_dict(),
            }, is_best, args.logdir)

        if scheduler is not None:
            scheduler.step(epoch=epoch_i)
            logger.debug('Scheduler stepped.')
            for param_group in optimizer.param_groups:
                logger.debug(param_group['lr'])

        logger.info(message)
Example #29
def main(conf):

    warnings.filterwarnings("ignore")
    best_score = 0.
    val_score = 0
    val_loss = 0
    epoch_start = 0

    # dataloader
    train_loader, val_loader = get_dataloader(conf)

    # model
    model = networks.get_model(conf)
    model = nn.DataParallel(model).cuda()

    if conf.weightfile is not None:
        wmodel = networks.get_model(conf)
        wmodel = nn.DataParallel(wmodel).cuda()
        checkpoint_dict = load_checkpoint(wmodel, conf.weightfile)
        if 'best_score' in checkpoint_dict:
            print('best score: {}'.format(checkpoint_dict['best_score']))
    else:
        wmodel = model

    # training setting
    criterion, optimizer, scheduler = get_train_setting(model, conf)

    # training and evaluate process for each epoch
    train, validate = get_proc(conf)

    if conf.resume:
        checkpoint_dict = load_checkpoint(model, conf.resume)
        epoch_start = checkpoint_dict['epoch']
        if 'best_score' in checkpoint_dict:
            best_score = checkpoint_dict['best_score']
            print('best score: {}'.format(best_score))
        print('Resuming training process from epoch {}...'.format(epoch_start))
        optimizer.load_state_dict(checkpoint_dict['optimizer'])
        scheduler.load_state_dict(checkpoint_dict['scheduler'])
        print('Resuming lr scheduler')
        print(checkpoint_dict['scheduler'])

    if conf.evaluate:
        print(validate(val_loader, model, criterion, conf))
        return

    detach_epoch = conf.epochs + 1
    if 'detach_epoch' in conf:
        detach_epoch = conf.detach_epoch

    start_eval = 0
    if 'start_eval' in conf:
        start_eval = conf.start_eval

    ## ------main loop-----
    for epoch in range(epoch_start, conf.epochs):
        lr = optimizer.param_groups[0]['lr']
        logging.info("Epoch: [{} | {} LR: {}".format(epoch + 1, conf.epochs,
                                                     lr))

        if epoch == detach_epoch:
            model.module.set_detach(False)

        tmp_loss = train(train_loader, model, criterion, optimizer, conf,
                         wmodel)
        infostr = 'Epoch: {}   train_loss: {}'.format(epoch + 1, tmp_loss)
        logging.info(infostr)
        scheduler.step()

        if epoch > start_eval:
            with torch.no_grad():
                val_score, val_loss, mscore, ascore = validate(
                    val_loader, model, criterion, conf)
                comscore = val_score
                if 'midlevel' in conf and conf.midlevel:
                    comscore = ascore
                is_best = comscore > best_score
                best_score = max(comscore, best_score)
                infostr = ('Epoch: {}   loss: {:.4f}, gs: {:.4f}, bs: {:.4f}, '
                           'ms: {:.4f}, as: {:.4f}'.format(
                               epoch + 1, val_loss, val_score, best_score,
                               mscore, ascore))
                logging.info(infostr)
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'state_dict': model.module.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'scheduler': scheduler.state_dict(),
                        'best_score': best_score
                    },
                    is_best,
                    outdir=conf['outdir'])

    print('Best val acc: {}'.format(best_score))
    return 0
Example #30
def train_and_evaluate_kd(model,
                          teacher_model,
                          train_dataloader,
                          val_dataloader,
                          optimizer,
                          loss_fn_kd,
                          metrics,
                          params,
                          model_dir,
                          restore_file=None):
    """Train the model and evaluate every epoch.

    Args:
        model: (torch.nn.Module) the neural network
        params: (Params) hyperparameters
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) - file to restore (without its extension .pth.tar)
    """
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_val_acc = 0.0

    # Tensorboard logger setup
    # board_logger = utils.Board_Logger(os.path.join(model_dir, 'board_logs'))

    # fetch teacher outputs using teacher_model under eval() mode
    loading_start = time.time()
    teacher_model.eval()
    teacher_outputs = fetch_teacher_outputs(teacher_model, train_dataloader,
                                            params)
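    # NOTE: caching teacher outputs assumes the train dataloader yields batches
    # in the same order every epoch (i.e. no reshuffling between epochs)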
    elapsed_time = math.ceil(time.time() - loading_start)
    logging.info("- Finished computing teacher outputs after {} secs..".format(
        elapsed_time))

    # learning rate schedulers for different models:
    if params.model_version == "resnet18_distill":
        scheduler = StepLR(optimizer, step_size=150, gamma=0.1)
    # for cnn models, num_epochs is always < 100, so this StepLR (step_size=100) never actually fires
    elif params.model_version == "cnn_distill":
        scheduler = StepLR(optimizer, step_size=100, gamma=0.2)

    for epoch in range(params.num_epochs):

        scheduler.step()

        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        train_kd(model, teacher_outputs, optimizer, loss_fn_kd,
                 train_dataloader, metrics, params)

        # Evaluate for one epoch on validation set
        val_metrics = evaluate_kd(model, val_dataloader, metrics, params)

        val_acc = val_metrics['accuracy']
        is_best = val_acc >= best_val_acc

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir)

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir,
                                          "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir,
                                      "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
Example #31
def main():
    # parse arg and start experiment
    global args
    best_ap = -1.
    best_iter = 0

    args = parser.parse_args()
    args.config_of_data = config.datasets[args.data]
    # args.num_classes = config.datasets[args.data]['num_classes']
    if configure is None:
        args.tensorboard = False
        print(Fore.RED +
              'WARNING: you don\'t have tensorboard_logger installed' +
              Fore.RESET)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            old_args = checkpoint['args']
            print('Old args:')
            print(old_args)
            # set args based on checkpoint
            if args.start_iter <= 0:
                args.start_iter = checkpoint['iter'] + 1
            best_iter = args.start_iter - 1
            best_ap = checkpoint['best_ap']
            for name in arch_resume_names:
                if name in vars(args) and name in vars(old_args):
                    setattr(args, name, getattr(old_args, name))
            model = get_model(**vars(args))
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (iter {})"
                  .format(args.resume, checkpoint['iter']))
        else:
            print(
                "=> no checkpoint found at '{}'".format(
                    Fore.RED +
                    args.resume +
                    Fore.RESET),
                file=sys.stderr)
            return
    else:
        # create model
        print("=> creating model '{}'".format(args.arch))
        model = get_model(**vars(args))

    # cudnn.benchmark = True
    cudnn.enabled = False

    # create dataloader
    if args.evaluate == 'val':
        train_loader, val_loader, test_loader = getDataloaders(
            splits=('val',), **vars(args))
        validate(val_loader, model, best_iter)
        return
    elif args.evaluate == 'test':
        train_loader, val_loader, test_loader = getDataloaders(
            splits=('test',), **vars(args))
        validate(test_loader, model, best_iter)
        return
    else:
        train_loader, val_loader, test_loader = getDataloaders(
            splits=('train', 'val'), **vars(args))

    # define optimizer
    optimizer = get_optimizer(model, args)

    # check if the folder exists
    if os.path.exists(args.save):
        print(Fore.RED + args.save + Fore.RESET
              + ' already exists!', file=sys.stderr)
        if not args.force:
            ans = input('Do you want to overwrite it? [y/N]:')
            if ans not in ('y', 'Y', 'yes', 'Yes'):
                sys.exit(1)
        print('remove existing ' + args.save)
        shutil.rmtree(args.save)
    os.makedirs(args.save)
    print('create folder: ' + Fore.GREEN + args.save + Fore.RESET)

    # copy code to save folder
    if args.save.find('debug') < 0:
        shutil.copytree(
            '.',
            os.path.join(
                args.save,
                'src'),
            symlinks=True,
            ignore=shutil.ignore_patterns(
                '*.pyc',
                '__pycache__',
                '*.path.tar',
                '*.pth',
                '*.ipynb',
                '.*',
                'data',
                'save',
                'save_backup'))
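
    # Snapshotting the source tree next to the checkpoints makes each run
    # reproducible from its own save folder.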

    # set up logging
    global log_print, f_log
    f_log = open(os.path.join(args.save, 'log.txt'), 'w')

    def log_print(*args):
        print(*args)
        print(*args, file=f_log)
    log_print('args:')
    log_print(args)
    print('model:', file=f_log)
    print(model, file=f_log, flush=True)
    # log_print('model:')
    # log_print(model)
    # log_print('optimizer:')
    # log_print(vars(optimizer))
    log_print('# of params:',
              str(sum([p.numel() for p in model.parameters()])))
    torch.save(args, os.path.join(args.save, 'args.pth'))
    scores = ['iter\tlr\ttrain_loss\tval_ap']
    if args.tensorboard:
        configure(args.save, flush_secs=5)

    for i in range(args.start_iter, args.niters + 1, args.eval_freq):
        # print('iter {:3d} lr = {:.6e}'.format(i, lr))
        # if args.tensorboard:
        #     log_value('lr', lr, i)

        # train for args.eval_freq iterations
        train_loss = train(train_loader, model, optimizer,
                           i, args.eval_freq)
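        # train() just ran eval_freq iterations starting at i; advance i to
        # the last completed iteration so validation and logging line up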
        i += args.eval_freq - 1

        # evaluate on validation set
        val_ap = validate(val_loader, model, i)

        # save scores to a tsv file, rewrite the whole file to prevent
        # accidental deletion
        lr = optimizer.param_groups[0]['lr']
        scores.append(('{}\t{}' + '\t{:.4f}' * 2)
                      .format(i, lr, train_loss, val_ap))
        with open(os.path.join(args.save, 'scores.tsv'), 'w') as f:
            print('\n'.join(scores), file=f)

        # remember best val_ap and save checkpoint
        is_best = val_ap > best_ap
        if is_best:
            best_ap = val_ap
            best_iter = i
            print(Fore.GREEN + 'Best val_ap {:.4f}'.format(best_ap) +
                  Fore.RESET)
        save_checkpoint({
            'args': args,
            'iter': i,
            'best_iter': best_iter,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_ap': best_ap,
        }, is_best, args.save)
        if not is_best and i - best_iter >= args.patience > 0:
            break
    print('Best val_ap: {:.4f} at iter {}'.format(best_ap, best_iter))
Example #32
filepath = os.path.join(args.model_dir, 'val_best_weights.json')
best_val_acc = 0.0
counter = 0  # epochs since the last improvement (`patience` is assumed set elsewhere)
if os.path.exists(filepath):
    with open(filepath) as f:
        best_val_acc = json.load(f)['accuracy']

for epoch in range(args.max_epochs):
    train(train_set, train_set2, model, args, 'train')
    val_acc = val(val_set, val_set2, model, args, 'val')
    val_metrics = {'accuracy': val_acc}
    is_best = val_acc >= best_val_acc

    utils.save_checkpoint({'epoch': epoch + 1,
                           'state_dict': model.state_dict(),
                           'optim_dict': optimizer.state_dict()},
                          is_best=is_best,
                          checkpoint=args.model_dir)

    if is_best:
        logging.info('- Found new best accuracy')
        counter = 0  # reset counter
        best_val_acc = val_acc

        best_json_path = os.path.join(
            args.model_dir, 'val_best_weights.json')
        utils.save_dict_to_json(val_metrics, best_json_path)
    else:
        counter += 1

    if counter > patience:
        logging.info('- No improvement in a while, stopping training...')
        break
Example #33
        train()
        if 't0' in optimizer.param_groups[0]:
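            # ASGD exposes 't0' in its param_groups once weight averaging has
            # started: temporarily swap each parameter for the optimizer's
            # running average ('ax'), validate on the averaged weights, then
            # restore the raw weights below before training continues.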
            tmp = {}
            for prm in model.parameters():
                tmp[prm] = prm.data.clone()
                prm.data = optimizer.state[prm]['ax'].clone()

            val_loss2 = evaluate(val_data)
            logging('-' * 89)
            logging('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                    'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                               val_loss2, math.exp(val_loss2)))
            logging('-' * 89)

            if val_loss2 < stored_loss:
                save_checkpoint(model, optimizer, args.save)
                logging('Saving Averaged!')
                stored_loss = val_loss2

            for prm in model.parameters():
                prm.data = tmp[prm].clone()

        else:
            val_loss = evaluate(val_data, eval_batch_size)
            logging('-' * 89)
            logging('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                    'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                               val_loss, math.exp(val_loss)))
            logging('-' * 89)

            if val_loss < stored_loss:
Example #34
def train_model(model,
                dataloaders,
                criterion,
                optimizer,
                start_epoch,
                num_epochs=args.epochs):
    '''
    Train model.

    model: Model
    dataloaders: dataloader dict: {'train': ..., 'val': ...}
    criterion: loss function
    optimizer: optimizer for training
    start_epoch: epoch to resume training from
    num_epochs: number of epochs to train

    Out: best model, val_acc_history
    '''
    since = time.time()
    val_acc_history = []
    lr = args.lr
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    learning_rate_decay_start = args.lr_decay_start
    learning_rate_decay_every = args.lr_decay_every
    learning_rate_decay_rate = args.lr_decay_rate
    for epoch in range(start_epoch, num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print("-" * 10)
        if epoch > learning_rate_decay_start and learning_rate_decay_every > 0:
            frac = (epoch -
                    learning_rate_decay_start) // learning_rate_decay_every
            decay_factor = learning_rate_decay_rate**frac
            current_lr = lr * decay_factor
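            # e.g. with lr=0.01, decay_rate=0.9, decay_every=5: twelve epochs
            # past decay_start gives frac=2, so current_lr = 0.01 * 0.9**2 = 0.0081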
            set_lr(optimizer, current_lr)
            print("Learning rate: ", current_lr)
        for phase in ["train", "val"]:
            if phase == "train":
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            running_corrects = 0
            for inputs, labels in dataloaders[phase]:
                t = inputs.size(0)
                if phase == "val":
                    bs, ncrops, c, h, w = inputs.shape
                    inputs = inputs.view(-1, c, h, w)  # (bs*ncrops, c, h, w)
                inputs = inputs.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
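                    # ten-crop eval: regroup logits to (bs, ncrops, n_classes)
                    # and average over the crops before computing the loss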
                    if phase == "val":
                        outputs = outputs.view(bs, ncrops, -1).mean(1)
                    loss = criterion(outputs, labels)

                    _, preds = torch.max(outputs, 1)
                    if phase == 'train':
                        loss.backward()
                        clip_gradient(optimizer, 0.1)
                        optimizer.step()
                running_loss += loss.item() * t
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / (dataloader_length[phase])
            epoch_acc = running_corrects.double() / (dataloader_length[phase])
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss,
                                                       epoch_acc))

            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
            if phase == 'val':
                val_acc_history.append(epoch_acc)
        save_checkpoint(epoch, best_model_wts, optimizer)
        print()
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    model.load_state_dict(best_model_wts)
    return model, val_acc_history
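
# A minimal usage sketch for train_model (hypothetical names: assumes `args`,
# `device`, `dataloader_length`, `set_lr`, `clip_gradient` and
# `save_checkpoint` are defined as above):
#
#   model = torchvision.models.resnet18(num_classes=7).to(device)
#   criterion = nn.CrossEntropyLoss()
#   optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)
#   model, history = train_model(model, {'train': train_loader, 'val': val_loader},
#                                criterion, optimizer, start_epoch=0,
#                                num_epochs=args.epochs)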
Example #35
                "EM": np.round(valid_em / n_samples, 2),
                "F1": np.round(valid_f1 / n_samples, 2),
                "epoch": epoch + 1
            })
        print("Valid loss of the model at epoch {} is: {}".format(
            epoch + 1, np.round(valid_losses / len(valid_dataloader), 2)))
        print("Valid EM of the model at epoch {} is: {}".format(
            epoch + 1, np.round(valid_em / n_samples, 2)))
        print("Valid F1 of the model at epoch {} is: {}".format(
            epoch + 1, np.round(valid_f1 / n_samples, 2)))

    # save last model weights
    save_checkpoint(
        {
            "epoch": epoch + 1 + epoch_checkpoint,
            "state_dict": model.state_dict(),
            "best_valid_loss": np.round(valid_losses / len(valid_dataloader),
                                        2)
        }, True, os.path.join(experiment_path, "model_last_checkpoint.pkl"))

    # save model with best validation error
    is_best = bool(
        np.round(valid_losses / len(valid_dataloader), 2) < best_valid_loss)
    best_valid_loss = min(np.round(valid_losses / len(valid_dataloader), 2),
                          best_valid_loss)
    save_checkpoint(
        {
            "epoch": epoch + 1 + epoch_checkpoint,
            "state_dict": model.state_dict(),
            "best_valid_loss": best_valid_loss
        }, is_best, os.path.join(experiment_path, "model.pkl"))
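
    # The "last" checkpoint above is written unconditionally (is_best=True);
    # model.pkl is flagged best only when the rounded validation loss improves.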
    def save(self, prefix, epoch):
        # keep only the learnable parameters from arg_dict when checkpointing
        arg_params = {}
        for name, tensor in self.arg_dict.items():
            if is_param_name(name):
                arg_params[name] = tensor
        save_checkpoint(prefix, epoch, arg_params)