Example #1
def init_opt(args, model, logger):
    if args.optimizer == 'adam':
        if args.transformer_lr:
            opt = torch.optim.Adam(model.params, lr=args.transformer_lr_multiply, betas=(0.9, 0.98), eps=1e-9,
                                   weight_decay=args.weight_decay)
            lr_lambda = partial(get_transformer_learning_rate, dimension=args.dimension, warmup=args.warmup)
            scheduler = torch.optim.lr_scheduler.LambdaLR(opt, lr_lambda)
        else:
            opt = torch.optim.Adam(model.params, lr=args.lr_rate, betas=(args.beta0, 0.999),
                                   weight_decay=args.weight_decay)
            scheduler = None
    elif args.optimizer == 'radam':
        import radam
        if args.transformer_lr:
            logger.warning('--transformer_lr has no effect with RAdam optimizer, warmup is never applied')
        opt = radam.RAdam(model.params, lr=args.lr_rate, betas=(args.beta0, 0.999), weight_decay=args.weight_decay)
        scheduler = None
    else:
        assert args.optimizer == 'sgd'
        if args.transformer_lr:
            opt = torch.optim.SGD(model.params, lr=args.transformer_lr_multiply, weight_decay=args.weight_decay, )
            lr_lambda = partial(get_sgd_learning_rate, warmup=args.warmup)
            scheduler = torch.optim.lr_scheduler.LambdaLR(opt, lr_lambda)
        else:
            opt = torch.optim.SGD(model.params, lr=args.lr_rate, weight_decay=args.weight_decay, )
            scheduler = None

    return opt, scheduler
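The helper get_transformer_learning_rate used with LambdaLR above is not part of the excerpt. A minimal sketch, assuming it implements the standard inverse-square-root ("Noam") schedule; the project's actual helper may differ:

def get_transformer_learning_rate(step, dimension, warmup):
    # Noam schedule: linear warmup for `warmup` steps, then step**-0.5 decay,
    # scaled by dimension**-0.5. LambdaLR multiplies the optimizer's base lr
    # (transformer_lr_multiply above) by this value.
    step = max(step, 1)  # LambdaLR starts counting at step 0
    return dimension ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)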
Example #2
def init_opt(args, model, logger):
    if args.optimizer == 'adam':
        # Adam with transformer schedule has a different set of default hyperparameters:
        if args.lr_schedule == 'transformer':
            opt = torch.optim.Adam(model.params,
                                   lr=args.lr_multiply,
                                   betas=(0.9, 0.98),
                                   eps=1e-9,
                                   weight_decay=args.weight_decay)
        else:
            opt = torch.optim.Adam(model.params,
                                   lr=args.lr_multiply,
                                   betas=(args.beta0, 0.999),
                                   weight_decay=args.weight_decay)
    elif args.optimizer == 'adamw':
        opt = AdamW(model.params,
                    lr=args.lr_multiply,
                    weight_decay=args.weight_decay)
    elif args.optimizer == 'radam':
        import radam
        if args.warmup > 1:
            logger.warning('With RAdam optimizer, warmup is never applied')
        opt = radam.RAdam(model.params,
                          lr=args.lr_multiply,
                          betas=(args.beta0, 0.999),
                          weight_decay=args.weight_decay)
    else:
        assert args.optimizer == 'sgd'
        opt = torch.optim.SGD(model.params,
                              lr=args.lr_multiply,
                              weight_decay=args.weight_decay)

    if args.lr_schedule == 'transformer':
        lr_lambda = partial(get_transformer_learning_rate,
                            dimension=args.dimension,
                            warmup=args.warmup)
        scheduler = torch.optim.lr_scheduler.LambdaLR(opt, lr_lambda)
    elif args.lr_schedule == 'constant':
        scheduler = get_constant_schedule_with_warmup(
            opt, num_warmup_steps=args.warmup)
    elif args.lr_schedule == 'linear':
        scheduler = get_linear_schedule_with_warmup(
            opt,
            num_training_steps=sum(args.train_iterations) //
            args.gradient_accumulation_steps,
            num_warmup_steps=args.warmup)
    elif args.lr_schedule == 'cosine':
        scheduler = get_cosine_schedule_with_warmup(
            opt,
            num_training_steps=sum(args.train_iterations) //
            args.gradient_accumulation_steps,
            num_warmup_steps=args.warmup,
            num_cycles=0.5)
    elif args.lr_schedule == 'sgd':
        lr_lambda = partial(get_sgd_learning_rate, warmup=args.warmup)
        scheduler = torch.optim.lr_scheduler.LambdaLR(opt, lr_lambda)
    else:
        raise ValueError('Invalid learning rate scheduler.')

    return opt, scheduler
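The schedule helpers (get_constant_schedule_with_warmup, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup) and AdamW match the functions exported by Hugging Face transformers, which this snippet presumably imports. A hypothetical invocation, with a stand-in model and argument names mirroring exactly what init_opt reads:

import argparse
import logging
import types

import torch

# Stand-in for the model object: init_opt only touches model.params.
model = types.SimpleNamespace(params=[torch.nn.Parameter(torch.zeros(4, 4))])

args = argparse.Namespace(
    optimizer='adam', lr_schedule='cosine', lr_multiply=1e-3, beta0=0.9,
    weight_decay=0.0, warmup=500, dimension=512,
    train_iterations=[100000], gradient_accumulation_steps=1)

opt, scheduler = init_opt(args, model, logging.getLogger(__name__))
opt.step()        # would normally follow loss.backward()
scheduler.step()  # advance the cosine-with-warmup schedule by one step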
Example #3
def optimizer(net, args):
    assert args.optimizer.lower() in ["sgd", "adam", "radam"], "Invalid Optimizer"

    if args.optimizer.lower() == "sgd":
        return optim.SGD(net.parameters(), lr=args.lr, momentum=args.beta1, nesterov=args.nesterov)
    elif args.optimizer.lower() == "adam":
        return optim.Adam(net.parameters(), lr=args.lr, betas=(args.beta1, args.beta2))
    elif args.optimizer.lower() == "radam":
        return radam.RAdam(net.parameters(), lr=args.lr, betas=(args.beta1, args.beta2))
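Imports and a hypothetical call for the helper above; radam refers to the standalone RAdam package, and the argument names are assumptions that mirror what the function reads:

import argparse

import torch.nn as nn
import torch.optim as optim

import radam  # standalone RAdam implementation

net = nn.Linear(10, 2)  # any nn.Module works here
args = argparse.Namespace(optimizer="radam", lr=1e-3,
                          beta1=0.9, beta2=0.999, nesterov=True)
opt = optimizer(net, args)  # -> radam.RAdam over net.parameters()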
Example #4
    def init_fn(self):
        if self.options.model == 'flow':
            num_input_channels = self.options.n_time_bins * 2
            num_output_channels = 2
        elif self.options.model == 'recons':
            # For the reconstruction model, we sum the event volume across the time dimension, so
            # that the network only sees a single channel event input, plus the prev image.
            num_input_channels = 1 + self.options.n_image_channels
            num_output_channels = self.options.n_image_channels
        else:
            raise ValueError(
                "Class was initialized with an invalid model {}"
                ", only {{EventGAN, flow, recons}} are supported.".format(
                    self.options.model))

        self.cycle_unet = UNet(num_input_channels=num_input_channels,
                               num_output_channels=num_output_channels,
                               skip_type='concat',
                               activation='tanh',
                               num_encoders=4,
                               base_num_channels=32,
                               num_residual_blocks=2,
                               norm='BN',
                               use_upsample_conv=True,
                               multi=True)

        self.models_dict = {"model": self.cycle_unet}
        model_params = self.cycle_unet.parameters()

        optimizer = radam.RAdam(list(model_params),
                                lr=self.options.lrc,
                                weight_decay=self.options.wd,
                                betas=(self.options.lr_decay, 0.999))

        self.ssim = pytorch_ssim.SSIM()
        self.l1 = nn.L1Loss(reduction="mean")
        self.image_loss = lambda x, y: self.l1(x, y) - self.ssim(x, y)

        self.optimizers_dict = {"optimizer": optimizer}

        self.train_ds, self.train_sampler = event_loader.get_and_concat_datasets(
            self.options.train_file, self.options, train=True)
        self.validation_ds, self.validation_sampler = event_loader.get_and_concat_datasets(
            self.options.validation_file, self.options, train=False)

        self.cdl_kwargs["collate_fn"] = event_utils.none_safe_collate
        self.cdl_kwargs["sampler"] = self.train_sampler
Example #5
    def configure_optimizers(self):
        params = self.parameters()

        if isinstance(self._optimizer, dict):
            optimizer = radam.RAdam(params, **self._optimizer)
        else:
            optimizer = self._optimizer(params)

        if isinstance(self._scheduler, dict):
            scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, **self._scheduler)
        else:
            scheduler = self._scheduler(optimizer)

        return {
            'optimizer': optimizer,
            'lr_scheduler': scheduler
        }
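configure_optimizers above accepts either a kwargs dict or a factory callable for both self._optimizer and self._scheduler. A hypothetical LightningModule showing one way those attributes could be wired up to hit each branch (the class and the values are illustrative, not taken from the source project):

from functools import partial

import pytorch_lightning as pl
import torch

class LitModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(8, 1)
        # Dict form: unpacked into radam.RAdam(params, **kwargs) and
        # torch.optim.lr_scheduler.ExponentialLR(optimizer, **kwargs).
        self._optimizer = {"lr": 1e-3, "weight_decay": 1e-5}
        self._scheduler = {"gamma": 0.95}
        # Callable form works with the same configure_optimizers:
        # self._optimizer = partial(torch.optim.AdamW, lr=3e-4)
        # self._scheduler = partial(torch.optim.lr_scheduler.StepLR, step_size=10)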
Example #6
def main(cfg):
    net = SPRINSeg(6, cfg.fps_n).cuda()
    if len(cfg.resume_path) > 0:
        net.load_state_dict(
            torch.load(hydra.utils.to_absolute_path(cfg.resume_path)))
    opt = radam.RAdam(net.parameters(), cfg.lr, weight_decay=cfg.weight_decay)
    pcs_train, segs_centered_train, segs_train = read_data(
        hydra.utils.to_absolute_path('shapenet_part_seg_hdf5_data'),
        r'ply_data_(train|val).*\.h5')
    pcs_test, segs_centered_test, segs_test = read_data(
        hydra.utils.to_absolute_path('shapenet_part_seg_hdf5_data'),
        r'ply_data_test.*\.h5')

    print(len(pcs_train))
    print(len(pcs_test))

    for e in range(1, cfg.max_epoch):
        run_epoch(net,
                  pcs_train,
                  segs_centered_train,
                  segs_train,
                  opt,
                  e,
                  ds=cfg.npoints,
                  batchsize=cfg.batch_size)

        if e % 10 == 0:
            run_epoch(net,
                      pcs_test,
                      segs_centered_test,
                      segs_test,
                      opt,
                      e,
                      train=False,
                      ds=cfg.npoints,
                      batchsize=cfg.batch_size,
                      rand_rot=True)
            torch.save(net.state_dict(), 'epoch{}.pt'.format(e))
Example #7
def main():
    discount = 0.995
    unroll_steps = 5
    replay_buffer_size = 1000
    batch_size = 128

    env = TicTacToeEnv()
    agent = MuZeroAgent(discount=discount)
    replay = ReplayBuffer(replay_buffer_size, batch_size, unroll_steps)
    # optimizer = torch.optim.SGD(agent.network.parameters(), lr=1e-4, momentum=0.9, weight_decay=1e-6, nesterov=True)
    optimizer = radam.RAdam(agent.network.parameters(),
                            lr=1e-2,
                            weight_decay=1e-6)

    agent.load_model("muzero_model.pth")
    try:
        writer = SummaryWriter("./logs/MuZero")
        muzero(env, agent, replay, optimizer, writer)
    except KeyboardInterrupt:
        print("Keyboard interrupt")
    print("Train complete")
    validate(env, agent, True)
    agent.save_model("muzero_model.pth")
Example #8
seed = 20170705
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
train_file = "train_large.txt"
feature_sizes_file = "feature_sizes_large.txt"
debug = False
#train_file = "train.txt"
#feature_sizes_file = "feature_sizes.txt"
#debug = True

# load data
train_data = CriteoDataset('./data', train=True, train_file=train_file)

# split train and valid set
train_idx, valid_idx = split_train_and_valid(train_data, debug)

# loader
loader_train = DataLoader(train_data, batch_size=256, sampler=sampler.SubsetRandomSampler(train_idx), num_workers=0)
loader_val = DataLoader(train_data, batch_size=1000, sampler=sampler.SubsetRandomSampler(valid_idx), num_workers=0)

feature_sizes = np.loadtxt('./data/{}'.format(feature_sizes_file), delimiter=',')
feature_sizes = [int(x) for x in feature_sizes]
print(feature_sizes)

model = DeepFM(feature_sizes, use_cuda=True, overfitting=debug)
#optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=0.0)
optimizer = radam.RAdam(model.parameters(), lr=1e-3, weight_decay=0.0)
model.fit(loader_train, loader_val, optimizer, epochs=1000, verbose=True, print_every=1000, checkpoint_dir="./chkp")
Example #9
def train_fold(fold_idx, work_dir, train_filenames, test_filenames,
               batch_sampler, epoch, epochs_to_train):
    os.makedirs(work_dir, exist_ok=True)
    fold_logger = kfold.FoldLogger(work_dir)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # device = 'cpu'
    batch_size = 4

    # model = models.UNet(6, 1)
    # model = models.MyResNetModel()
    model = models.ResNetUNet(n_classes=1, upsample=True)
    # model = models.ResNetUNetPlusPlus(n_classes=1)
    # model = models.EfficientUNet(n_classes=1)

    # model = models.HRNetWithClassifier()

    model.to(device)
    model = torch.nn.DataParallel(model)
    # model.to(device)

    data_parallel_multiplier = max(1, torch.cuda.device_count())
    # data_parallel_multiplier = 1
    print('data_parallel_multiplier =', data_parallel_multiplier)

    img_size = 1024

    train_dataset = datareader.SIIMDataset('data/dicom-images-train',
                                           'data/train-rle.csv',
                                           ([img_size], [img_size]),
                                           augment=True,
                                           filenames_whitelist=train_filenames)
    # if batch_sampler is None:
    #     batch_sampler = samplers.OnlineHardBatchSampler(train_dataset, batch_size * data_parallel_multiplier,
    #                                                    drop_last=False)
    # train_dataloader = torch.utils.data.DataLoader(train_dataset, num_workers=os.cpu_count(),
    #                                                batch_sampler=batch_sampler)
    train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=batch_size *
                                                   data_parallel_multiplier,
                                                   shuffle=True,
                                                   num_workers=os.cpu_count())

    val_dataset = datareader.SIIMDataset('data/dicom-images-train',
                                         'data/train-rle.csv',
                                         ([img_size], [img_size]),
                                         filenames_whitelist=test_filenames)
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=batch_size *
                                                 data_parallel_multiplier,
                                                 shuffle=False,
                                                 num_workers=os.cpu_count())

    trainable_params = [
        param for param in model.parameters() if param.requires_grad
    ]

    lr_scaling_coefficient = (1 /
                              16) * data_parallel_multiplier * batch_size / 10
    # max_lr = 2e-3 * lr_scaling_coefficient
    # base_lr = 5e-5 * lr_scaling_coefficient

    # OHEM Limited loss works with that divided by 10
    max_lr = 2.5e-4 * lr_scaling_coefficient
    base_lr = 3.5e-5 * lr_scaling_coefficient

    # optim = torch.optim.Adam(params=trainable_params, lr=base_lr, betas=(0.0, 0.9))
    # optim = torch.optim.Adam(params=[
    #     {"params": backbone_parameters, "lr": base_lr},
    #     {"params": head_and_classifier_params, "lr": max_lr}], lr=base_lr)
    # optim = torch.optim.Adam(params=trainable_params, lr=max_lr)
    # optim = torch.optim.AdamW(params=trainable_params, lr=base_lr, weight_decay=0.00001)
    optim = radam.RAdam(params=trainable_params,
                        lr=base_lr,
                        weight_decay=0.0001)
    optim = torchcontrib.optim.SWA(optim)
    # optim = torch.optim.SGD(params=trainable_params,
    #                         momentum=0.98,
    #                         nesterov=True,
    #                         lr=base_lr)
    # optim = torch.optim.SGD(params=trainable_params,
    #                         momentum=0.9,
    #                         nesterov=True,
    #                         lr=base_lr)

    best_metric = 0.0
    _, loaded_best_metric = utils.try_load_checkpoint(work_dir,
                                                      model,
                                                      device,
                                                      optimizer=optim,
                                                      load_optimizer=True)
    if loaded_best_metric is not None: best_metric = loaded_best_metric

    # Experiments show that it often is good to set stepsize equal to 2 − 10 times the number of iterations in an epoch.
    # For example, setting stepsize = 8 ∗ epoch with the CIFAR-10 training run(as shown in Figure 1) only gives slightly
    # better results than setting stepsize = 2 ∗ epoch. (https://arxiv.org/pdf/1506.01186.pdf)
    # cycle_len = 4 == stepsize = 2
    # in my implementation
    epochs_per_cycle = 20
    lr_scheduler = lr_utils.CyclicalLR(max_lr=max_lr,
                                       base_lr=base_lr,
                                       steps_per_epoch=len(train_dataloader),
                                       epochs_per_cycle=epochs_per_cycle,
                                       mode='cosine')
    lr_scheduler.step_value = epoch * len(train_dataloader)

    steps_per_epoch = len(train_dataloader)
    # torch.optim.lr_scheduler.CyclicLR(optimizer=optim, base_lr=base_lr, max_lr=max_lr, step_size_up=steps_per_epoch * 1,
    #                                   step_size_down=steps_per_epoch * 4, mode='triangular', gamma=1.0, scale_fn=None,
    #                                   scale_mode='cycle',
    #                                   cycle_momentum=False, base_momentum=0.8, max_momentum=0.9,
    #                                   last_epoch=-1)

    # model, optimizer = amp.initialize(model, optim, opt_level='O0')

    writer = SummaryWriter(work_dir)
    for i in range(epochs_to_train):
        train_result_dict = train_one_epoch(model=model,
                                            optimizer=optim,
                                            data_loader=train_dataloader,
                                            device=device,
                                            epoch=epoch,
                                            lr_scheduler=lr_scheduler,
                                            summary_writer=writer,
                                            print_freq=100)

        val_result_dict = validate.validate(model, val_dataloader, device)
        mask_thresh, mask_score = val_result_dict['best_mask_score']
        class_thresh, class_score = val_result_dict['best_class_score']
        global_step = epoch * len(train_dataloader)
        writer.add_scalar('dice', mask_score, global_step=global_step)
        writer.add_scalar('classification_accuracy',
                          class_score,
                          global_step=global_step)
        writer.add_scalar('mean_epoch_loss',
                          train_result_dict['loss'],
                          global_step=global_step)
        writer.add_scalar('epoch', epoch, global_step=global_step)

        # {'best_mask_score': best_mask_score, 'mean_mask_scores': mean_mask_scores,
        #  'best_class_score': best_class_score, 'mean_class_scores': mean_class_scores}
        log_data = {
            'score': val_result_dict['best_mask_score'][1],
            'mask_threshold': val_result_dict['best_mask_score'][0],
            'class_accuracy': val_result_dict['best_class_score'][1],
            'class_thresold': val_result_dict['best_class_score'][0]
        }
        if (epoch + 1) % epochs_per_cycle == 0 and epoch != 0:
            print('Updating SWA running average')
            optim.update_swa()
        epoch += 1
        break

    # if mask_score > best_metric:
    #     best_metric = mask_score
    # if epoch % epochs_per_cycle == 0:
    fold_logger.log_epoch(epoch - 1, log_data)
    utils.save_checkpoint(output_dir=work_dir,
                          epoch=epoch - 1,
                          model=model,
                          optimizer=optim,
                          best_metric=best_metric)

    if (epoch) % epochs_per_cycle == 0 and epoch != 0:
        optim.swap_swa_sgd()
        print('Swapped SWA buffers')
        print('Updating BatchNorm statistics...')
        optim.bn_update(
            utils.dataloader_image_extract_wrapper(train_dataloader), model,
            device)
        print('Updated BatchNorm statistics')
        print('Validating SWA model...')
        val_result_dict = validate.validate(model, val_dataloader, device)
        log_data = {
            'score': val_result_dict['best_mask_score'][1],
            'mask_threshold': val_result_dict['best_mask_score'][0],
            'class_accuracy': val_result_dict['best_class_score'][1],
            'class_thresold': val_result_dict['best_class_score'][0]
        }
        fold_logger.log_epoch('swa', log_data)
        print('Saved SWA model')
        utils.save_checkpoint(output_dir=work_dir,
                              epoch=None,
                              name='swa',
                              model=model,
                              optimizer=optim,
                              best_metric=best_metric)

    return {
        'mask_score': mask_score,
        'class_score': class_score,
        'global_step': global_step,
        'batch_sampler': batch_sampler
    }
Example #10
    def init_fn(self):
        # build model
        self.generator, self.discriminator = build_gan(self.options)

        self.models_dict = {"gen": self.generator, "dis": self.discriminator}
        if not self.is_training:
            self.optimizers_dict = {}
            return

        if self.options.cycle_recons:
            model_folder = "EventGAN/pretrained_models/{}".format(
                self.options.cycle_recons_model)
            checkpoint = os.path.join(model_folder,
                                      os.listdir(model_folder)[-1])
            self.cycle_unet_recons = torch.load(checkpoint)
            self.cycle_unet_recons.eval()
            self.models_dict["e2i"] = self.cycle_unet_recons
        if self.options.cycle_flow:
            model_folder = "EventGAN/pretrained_models/{}".format(
                self.options.cycle_flow_model)
            checkpoint = os.path.join(model_folder,
                                      os.listdir(model_folder)[-1])
            self.cycle_unet_flow = torch.load(checkpoint)
            self.cycle_unet_flow.eval()
            self.models_dict["e2f"] = self.cycle_unet_flow

        # params for each part of the network
        dis_params = filter(lambda p: p.requires_grad,
                            self.discriminator.parameters())
        gen_params = filter(lambda p: p.requires_grad,
                            self.generator.parameters())
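        # NOTE: the next line overrides the requires_grad filter above, so
        # every generator parameter (frozen or not) is handed to the optimizer.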
        gen_params = self.generator.parameters()

        optimizer_dis = radam.RAdam(dis_params,
                                    lr=self.options.lrd,
                                    weight_decay=0.,
                                    betas=(0., 0.999))

        optimizer_gen = radam.RAdam(list(gen_params),
                                    lr=self.options.lrg,
                                    weight_decay=0.,
                                    betas=(0., 0.999))

        self.ssim = pytorch_ssim.SSIM()
        self.secondary_l1 = nn.L1Loss(reduction="mean")
        self.image_loss = lambda x, y: self.secondary_l1(x, y) - self.ssim(
            x, y)

        self.optimizers_dict = {
            "optimizer_gen": optimizer_gen,
            "optimizer_dis": optimizer_dis
        }

        self.train_ds, self.train_sampler = event_loader.get_and_concat_datasets(
            self.options.train_file, self.options, train=True)
        self.validation_ds, self.validation_sampler = event_loader.get_and_concat_datasets(
            self.options.validation_file, self.options, train=False)

        self.cdl_kwargs["collate_fn"] = event_utils.none_safe_collate
        self.cdl_kwargs["sampler"] = self.train_sampler

        self.prev_gen_losses = {}
        self.prev_dis_losses = {}
        self.prev_gen_outputs = {}
        self.prev_dis_outputs = {}
Example #11
import torch
import torch.nn.functional as F
from utils import prepare_cifar
import tqdm
import radam
from vgg import vgg13_bn
from models import PreActResNet18
from aegleseeker import AegleSeeker
from eval_model import eval_model_pgd

device = 'cuda:0'
model = vgg13_bn()
model = AegleSeeker(model).to(device)
train_loader, test_loader = prepare_cifar(100, 100)
optim = radam.RAdam(model.parameters())
epsilon = 8 / 255

for epoch in range(100):
    with tqdm.tqdm(train_loader) as train:
        running_loss = 0.0
        running_grad = 0.0
        running_acc = 0.0
        model.train()
        for i, (x, y) in enumerate(train):
            x, y = x.to(device), y.to(device)
            # x_bu = x.detach().clone()
            for _ in range(1):
                x_rg = x.detach().clone().requires_grad_(True) + \
                    torch.randn_like(x) * epsilon / 2
                optim.zero_grad()
                pred = model(x_rg)
Example #12
testdataloader = torch.utils.data.DataLoader(testdataset,
                                             batch_size=4,
                                             shuffle=True)
nb_classes = len(traindataset.classes)

viz = visdom.Visdom()

m = model.Model(nb_classes, 64)
m = m.cuda()
m.load_state_dict(torch.load('classifier_model.pt'))
print(m)

initial_learning_rate = 100 / sum(p.numel()
                                  for p in m.parameters() if p.requires_grad)
print("Initail Learning rate", initial_learning_rate)
optim = radam.RAdam(m.parameters(), lr=initial_learning_rate)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optim,
                                                       'min',
                                                       factor=0.5,
                                                       verbose=True)
#optim.load_state_dict(torch.load('classifier_optim.pt'))
trainlm = lossManager.LossManager(displayEvery=100, win="Train Losses")
testlm = lossManager.LossManager(displayEvery=1, win="Test Losses")


def train(m, optim, dataset):
    dataloader = torch.utils.data.DataLoader(nonechucks.SafeDataset(dataset),
                                             batch_size=16,
                                             shuffle=True,
                                             num_workers=16)
    m.train()
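The excerpt of train is cut off above; ReduceLROnPlateau in 'min' mode only lowers the learning rate when it is stepped with the monitored quantity. A minimal sketch of the outer loop that would drive it (the evaluate helper is hypothetical):

import torch

def run(m, optim, scheduler, traindataset, testdataset, epochs=50):
    for epoch in range(epochs):
        train(m, optim, traindataset)
        test_loss = evaluate(m, testdataset)  # hypothetical validation pass
        scheduler.step(test_loss)             # halves the lr once test_loss plateaus
        torch.save(m.state_dict(), 'classifier_model.pt')
        torch.save(optim.state_dict(), 'classifier_optim.pt')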
Example #13
        print("ADVERSARIAL")
        adv_optimizer = torch.optim.SGD(adv_hidden.parameters(),
                                        lr=args.adv_lr,
                                        weight_decay=args.adv_wdecay)
    #optimizer = torch.optim.ASGD(model.parameters(), lr=args.lr, t0=0, lambd=0., weight_decay=args.wdecay)
    # Ensure the optimizer is optimizing params, which includes both the model's weights as well as the criterion's weight (i.e. Adaptive Softmax)
    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(params,
                                    lr=args.lr,
                                    weight_decay=args.wdecay)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(params,
                                     lr=args.lr,
                                     weight_decay=args.wdecay)
    elif args.optimizer == 'radam':
        optimizer = radam.RAdam(params, lr=args.lr, weight_decay=args.wdecay)
    else:
        raise Exception("Bad value %s for optimizer type" % args.optimizer)

    epoch_start_time = time.time()
    epoch = 0
    val_loss2 = evaluate(val_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
              epoch, (time.time() - epoch_start_time), val_loss2,
              math.exp(val_loss2), val_loss2 / math.log(2)))
    print('-' * 89)

    print("MAX EPOCH = ", args.epochs + 1)
    for epoch in range(args.start, args.epochs + 1):
Example #14
def train_cv(input_directory, output_directory):
    # model
    # directory where model checkpoints are saved
    model_save_dir = '%s/%s_%s' % (
        config.ckpt, config.model_name + "_cv", time.strftime("%Y%m%d%H%M")
    )  #'%s/%s_%s' % (config.ckpt, args.model_name+"_cv", time.strftime("%Y%m%d%H%M"))
    for fold in range(config.kfold):
        print("***************************fold : {}***********************".
              format(fold))
        model = getattr(models, config.model_name)(fold=fold)
        # if args.ckpt and not args.resume:
        #     state = torch.load(args.ckpt, map_location='cpu')
        #     model.load_state_dict(state['state_dict'])
        #     print('train with pretrained weight val_f1', state['f1'])

        num_ftrs = model.fc.in_features
        model.fc = nn.Linear(num_ftrs, config.num_classes)

        #2019/11/11
        #save dense/fc weight for pretrain 55 classes
        # model = MyModel()
        # num_ftrs = model.classifier.out_features
        # model.fc = nn.Linear(55, config.num_classes)

        model = model.to(device)
        # data
        train_dataset = ECGDataset(data_path=config.train_data_cv.format(fold),
                                   data_dir=input_directory,
                                   train=True)

        train_dataloader = DataLoader(train_dataset,
                                      batch_size=config.batch_size,
                                      shuffle=True,
                                      drop_last=True,
                                      num_workers=6)

        val_dataset = ECGDataset(data_path=config.train_data_cv.format(fold),
                                 data_dir=input_directory,
                                 train=False)

        val_dataloader = DataLoader(val_dataset,
                                    batch_size=config.batch_size,
                                    drop_last=True,
                                    num_workers=4)

        print("fold_{}_train_datasize".format(fold), len(train_dataset),
              "fold_{}_val_datasize".format(fold), len(val_dataset))
        # optimizer and loss
        optimizer = radam.RAdam(
            model.parameters(),
            lr=config.lr)  #optim.Adam(model.parameters(), lr=config.lr)
        w = torch.tensor(train_dataset.wc, dtype=torch.float).to(device)
        criterion = utils.WeightedMultilabel(w)  # alternative: utils.FocalLoss()
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                         'max',
                                                         verbose=True,
                                                         factor=0.1,
                                                         patience=5,
                                                         min_lr=1e-06,
                                                         eps=1e-08)

        # if args.ex: model_save_dir += args.ex
        # best_f1 = -1
        # lr = config.lr
        # start_epoch = 1
        # stage = 1

        best_f1 = -1
        best_cm = -1
        lr = config.lr
        start_epoch = 1
        stage = 1
        epoch_cum = 0  # epochs since the last improvement in val_cm
        # resume training from the last checkpoint
        #         if args.resume:
        #             if os.path.exists(args.ckpt):  # this is the directory holding the saved weights
        #                 model_save_dir = args.ckpt
        #                 current_w = torch.load(os.path.join(args.ckpt, config.current_w))
        #                 best_w = torch.load(os.path.join(model_save_dir, config.best_w))
        #                 best_f1 = best_w['loss']
        #                 start_epoch = current_w['epoch'] + 1
        #                 lr = current_w['lr']
        #                 stage = current_w['stage']
        #                 model.load_state_dict(current_w['state_dict'])
        #                 # 如果中断点恰好为转换stage的点
        #                 if start_epoch - 1 in config.stage_epoch:
        #                     stage += 1
        #                     lr /= config.lr_decay
        #                     utils.adjust_learning_rate(optimizer, lr)
        #                     model.load_state_dict(best_w['state_dict'])
        #                 print("=> loaded checkpoint (epoch {})".format(start_epoch - 1))
        logger = Logger(logdir=model_save_dir, flush_secs=2)
        # =========> start training <=========
        for epoch in range(start_epoch, config.max_epoch + 1):
            since = time.time()
            train_loss, train_acc, train_f1, train_f2, train_g2, train_cm = train_epoch(
                model,
                optimizer,
                criterion,
                train_dataloader,
                show_interval=100)
            val_loss, val_acc, val_f1, val_f2, val_g2, val_cm = val_epoch(
                model, criterion, val_dataloader)

            # train_loss, train_f1 = train_beat_epoch(model, optimizer, criterion, train_dataloader, show_interval=100)
            # val_loss, val_f1 = val_beat_epoch(model, criterion, val_dataloader)

            print('#epoch:%02d, stage:%d, train_loss:%.3e, train_acc:%.3f, train_f1:%.3f, train_f2:%.3f, train_g2:%.3f,train_cm:%.3f,\n \
                    val_loss:%0.3e, val_acc:%.3f, val_f1:%.3f, val_f2:%.3f, val_g2:%.3f, val_cm:%.3f,time:%s\n'
                  % (epoch, stage, train_loss, train_acc,train_f1,train_f2,train_g2,train_cm, \
                    val_loss, val_acc, val_f1, val_f2, val_g2, val_cm,utils.print_time_cost(since)))

            logger.log_value('fold{}_train_loss'.format(fold),
                             train_loss,
                             step=epoch)
            logger.log_value('fold{}_train_f1'.format(fold),
                             train_f1,
                             step=epoch)
            logger.log_value('fold{}_val_loss'.format(fold),
                             val_loss,
                             step=epoch)
            logger.log_value('fold{}_val_f1'.format(fold), val_f1, step=epoch)
            state = {
                "state_dict": model.state_dict(),
                "epoch": epoch,
                "loss": val_loss,
                'f1': val_f1,
                'lr': lr,
                'stage': stage
            }

            save_ckpt_cv(state, best_cm < val_cm, model_save_dir, fold,
                         output_directory)
            best_cm = max(best_cm, val_cm)

            scheduler.step(val_cm)
            # scheduler.step()

            if val_cm < best_cm:
                epoch_cum += 1
            else:
                epoch_cum = 0

            # save_ckpt_cv(state, best_f1 < val_f1, model_save_dir,fold)
            # best_f1 = max(best_f1, val_f1)

            # if val_f1 < best_f1:
            #     epoch_cum += 1
            # else:
            #     epoch_cum = 0

            # if epoch in config.stage_epoch:
            # if epoch_cum == 5:
            #     stage += 1
            #     lr /= config.lr_decay
            #     if lr < 1e-6:
            #         lr = 1e-6
            #         print("*" * 20, "step into stage%02d lr %.3ef" % (stage, lr))
            #     best_w = os.path.join(model_save_dir, config.best_w_cv.format(fold))
            #     model.load_state_dict(torch.load(best_w)['state_dict'])
            #     print("*" * 10, "step into stage%02d lr %.3ef" % (stage, lr))
            #     utils.adjust_learning_rate(optimizer, lr)

            # elif epoch_cum >= 12:
            #     print("*" * 20, "step into stage%02d lr %.3ef" % (stage, lr))
            #     break

            if epoch_cum >= 12:
                print("*" * 20, "step into stage%02d lr %.3ef" % (stage, lr))
                break
Example #15
def train(input_directory, output_directory):
    # model
    model = getattr(models, config.model_name)()

    # if args.ckpt and not args.resume:
    #     state = torch.load(args.ckpt, map_location='cpu')
    #     model.load_state_dict(state['state_dict'])
    #     print('train with pretrained weight val_f1', state['f1'])

    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(num_ftrs, config.num_classes)

    model = model.to(device)
    # data
    train_dataset = ECGDataset(data_path=config.train_data,
                               data_dir=input_directory,
                               train=True)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=config.batch_size,
                                  shuffle=True,
                                  num_workers=6)
    val_dataset = ECGDataset(data_path=config.train_data,
                             data_dir=input_directory,
                             train=False)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=config.batch_size,
                                num_workers=4)

    print("train_datasize", len(train_dataset), "val_datasize",
          len(val_dataset))
    # optimizer and loss
    #optimizer = optim.Adam(model.parameters(), lr=config.lr)
    optimizer = radam.RAdam(model.parameters(),
                            lr=config.lr,
                            weight_decay=1e-4)  #config.lr
    #optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, dampening=0, weight_decay=1e-4, nesterov=False)
    w = torch.tensor(train_dataset.wc, dtype=torch.float).to(device)
    criterion = utils.WeightedMultilabel(w)  # alternative: utils.FocalLoss()

    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        'max',
        verbose=True,
        factor=0.1,
        patience=5,
        min_lr=1e-06,
        eps=1e-08)  #CosineAnnealingLR  CosineAnnealingWithRestartsLR
    #scheduler = pytorchtools.CosineAnnealingWithRestartsLR(optimizer,T_max=30, T_mult = 1.2, eta_min=1e-6)

    # optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, nesterov=True)
    # scheduler = pytorchtools.CosineAnnealingLR_with_Restart(optimizer, T_max=12, T_mult=1, model=model, out_dir='./snapshot',take_snapshot=True, eta_min=1e-9)

    # directory where model checkpoints are saved
    model_save_dir = '%s/%s_%s' % (config.ckpt, config.model_name,
                                   time.strftime("%Y%m%d%H%M"))

    # if args.ex: model_save_dir += args.ex

    best_f1 = -1
    best_cm = -1
    lr = config.lr
    start_epoch = 1
    stage = 1
    epoch_cum = 0  # epochs since the last improvement in val_cm

    # resume training from the last checkpoint
    # if args.resume:
    #     if os.path.exists(args.ckpt):  # this is the directory holding the saved weights
    #         model_save_dir = args.ckpt
    #         current_w = torch.load(os.path.join(args.ckpt, config.current_w))
    #         best_w = torch.load(os.path.join(model_save_dir, config.best_w))
    #         best_f1 = best_w['loss']
    #         start_epoch = current_w['epoch'] + 1
    #         lr = current_w['lr']
    #         stage = current_w['stage']
    #         model.load_state_dict(current_w['state_dict'])
    #         # if the checkpoint falls exactly on a stage-transition epoch
    #         if start_epoch - 1 in config.stage_epoch:
    #             stage += 1
    #             lr /= config.lr_decay
    #             utils.adjust_learning_rate(optimizer, lr)
    #             model.load_state_dict(best_w['state_dict'])
    #         print("=> loaded checkpoint (epoch {})".format(start_epoch - 1))

    logger = Logger(logdir=model_save_dir, flush_secs=2)
    # =========> start training <=========
    for epoch in range(start_epoch, config.max_epoch + 1):
        since = time.time()
        train_loss, train_acc, train_f1, train_f2, train_g2, train_cm = train_epoch(
            model, optimizer, criterion, train_dataloader, show_interval=100)
        val_loss, val_acc, val_f1, val_f2, val_g2, val_cm = val_epoch(
            model, criterion, val_dataloader)

        # train_loss, train_f1 = train_beat_epoch(model, optimizer, criterion, train_dataloader, show_interval=100)
        # val_loss, val_f1 = val_beat_epoch(model, criterion, val_dataloader)

        print('#epoch:%02d, stage:%d, train_loss:%.3e, train_acc:%.3f, train_f1:%.3f, train_f2:%.3f, train_g2:%.3f,train_cm:%.3f,\n \
                val_loss:%0.3e, val_acc:%.3f, val_f1:%.3f, val_f2:%.3f, val_g2:%.3f, val_cm:%.3f,time:%s\n'
              % (epoch, stage, train_loss, train_acc,train_f1,train_f2,train_g2,train_cm, \
                val_loss, val_acc, val_f1, val_f2, val_g2, val_cm,utils.print_time_cost(since)))

        logger.log_value('train_loss', train_loss, step=epoch)
        logger.log_value('train_f1', train_f1, step=epoch)
        logger.log_value('val_loss', val_loss, step=epoch)
        logger.log_value('val_f1', val_f1, step=epoch)
        state = {
            "state_dict": model.state_dict(),
            "epoch": epoch,
            "loss": val_loss,
            'f1': val_f1,
            'lr': lr,
            'stage': stage
        }

        save_ckpt(state, best_cm < val_cm, model_save_dir, output_directory)
        best_cm = max(best_cm, val_cm)

        scheduler.step(val_cm)
        # scheduler.step()

        if val_cm < best_cm:
            epoch_cum += 1
        else:
            epoch_cum = 0


#         # if epoch in config.stage_epoch:
#         if epoch_cum == 5:
#             stage += 1
#             lr /= config.lr_decay
#             if lr < 1e-6:
#                 lr = 1e-6
#                 print("*" * 20, "step into stage%02d lr %.3ef" % (stage, lr))
#             best_w = os.path.join(model_save_dir, config.best_w)
#             model.load_state_dict(torch.load(best_w)['state_dict'])
#             print("*" * 10, "step into stage%02d lr %.3ef" % (stage, lr))
#             utils.adjust_learning_rate(optimizer, lr)

#         elif epoch_cum >= 12:
#             print("*" * 20, "step into stage%02d lr %.3ef" % (stage, lr))
#             break

        if epoch_cum >= 12:
            print("*" * 20, "step into stage%02d lr %.3ef" % (stage, lr))
            break
Example #16
        ngpu = torch.cuda.device_count()
        device_ids = list(range(ngpu))
        model = torch.nn.DataParallel(model, device_ids)
        model.cuda()
    else:
        model.to(DEVICE)

    model.apply(init_weight)
    model.train()

    max_lr = 1e-3
    warmup_step = hp.warmup_step
    warmup_factor = hp.warmup_factor
    if hp.optimizer.lower() == 'radam':
        import radam
        optimizer = radam.RAdam(model.parameters(), lr=max_lr, betas=(0.9, 0.98), eps=1e-9)
    else:
        optimizer = torch.optim.Adam(model.parameters(), lr=max_lr, betas=(0.9, 0.98), eps=1e-9)

    save_dir = hp.save_dir # save dir name
    os.makedirs(save_dir, exist_ok=True)
    if hp_file != f'{save_dir}/hparams.py' and not filecmp.cmp(hp_file, f'{save_dir}/hparams.py'):
        shutil.copyfile(hp_file, f'{save_dir}/hparams.py')
    writer = SummaryWriter(f'{hp.log_dir}/logs/{hp.comment}')

    if hp.output_type == 'softmax':
        dataset_train = datasets.VQWav2vecTrainDatasets(hp.train_script)
        collate_fn_transformer = datasets.collate_fn_vqwav2vec
    else:
        dataset_train = datasets.get_dataset(hp.train_script)
        collate_fn_transformer = datasets.collate_fn
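The excerpt stops before warmup_step and warmup_factor are consumed, so the actual schedule is not shown. As a point of reference only, a Noam-style warmup is the usual companion to these settings; the helper below is an assumption, not code from the project:

def noam_lr(step, max_lr, warmup_step):
    # Linear warmup to max_lr over warmup_step updates, then inverse-sqrt decay.
    step = max(step, 1)
    return max_lr * min(step / warmup_step, (warmup_step / step) ** 0.5)

# Applied manually inside the training loop:
# for param_group in optimizer.param_groups:
#     param_group['lr'] = noam_lr(global_step, max_lr, warmup_step)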