Example #1
def __init__(self, data: DatasetCollection,
             config: MutableMapping) -> None:
    self.model = DeepClassiflie(config)
    self.swa_model = swa_utils.AveragedModel(self.model)
    if config.trainer.dump_model_thaw_sched_only:
        dump_default_thawing_schedule(
            self.model, f"{config.experiment.dc_base}/thaw_schedules")
        sys.exit(0)
    self.data = data
    self.training_session = TrainingSession(
        config, self.model.__class__.__name__,
        self.data.dataset_conf['num_train_recs'],
        self.data.dataset_conf['train_batch_size'])
    if self.training_session.config.trainer.histogram_vars:
        self.training_session.histogram_vars = {
            n: p
            for (n, p) in self.model.named_parameters()
            if any(n == v for v in
                   self.training_session.config.trainer.histogram_vars)
        }
    self.optimizer = self.init_optimizer()
    self.tokenizer = self.data.dataset_conf['albert_tokenizer']
    self.datasets = {
        'train': self.data.dataset_conf['train_ds'],
        'val': self.data.dataset_conf['val_ds'],
        'test': self.data.dataset_conf['test_ds']
    }
Example #2
    def __init__(self,
                 train_dataLoader,
                 val_dataLoader,
                 test_dataLoader,
                 dir_name,
                 device="cuda:0",
                 batch_size=4,
                 n_outputs=13,
                 learning_rate=3e-3,
                 num_epochs=200,
                 output_path='./outputs/',
                 detect=False):
        self.device = torch.device(device)
        torch.cuda.set_device(self.device)
        self.train_dataLoader = train_dataLoader
        self.val_dataLoader = val_dataLoader
        self.test_dataLoader = test_dataLoader
        self.model = cm2.customUNet(n_outputs=n_outputs,
                                    classifier=False).cuda()
        self.optimizer = Adam(self.model.parameters(), lr=learning_rate)
        #* Stochastic Weight Averaging (https://pytorch.org/docs/stable/optim.html#putting-it-all-together)
        self.swa_model = swa.AveragedModel(self.model)
        self.swa_scheduler = swa.SWALR(self.optimizer,
                                       swa_lr=0.05)  # SWA LR is held at a relatively large value
        self.swa_start = 100  # epoch at which SWA updates begin

        self.criterion = nn.BCEWithLogitsLoss().cuda()
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, mode='min', verbose=True)
        self.num_epochs = num_epochs
        self.writer = cw2.customWriter(log_dir=f'./runs/{dir_name}',
                                       batch_size=batch_size,
                                       num_classes=n_outputs)
        self.best_loss = 10000  # Initialise loss for saving best model
        self.output_path = output_path
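
The constructor above only creates the SWA objects; as a minimal, self-contained sketch (the toy model, data and optimiser below are illustrative placeholders, not part of this trainer), the `swa_model`, `swa_scheduler` and `swa_start` attributes are typically used along the lines of the PyTorch recipe linked in the comment:

import torch
from torch import nn
from torch.optim import swa_utils
from torch.utils.data import DataLoader, TensorDataset

# Toy stand-ins for the real model / data / optimiser (illustrative only).
model = nn.Linear(10, 2)
loader = DataLoader(TensorDataset(torch.randn(64, 10), torch.randn(64, 2)), batch_size=8)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
criterion = nn.MSELoss()

swa_model = swa_utils.AveragedModel(model)
swa_scheduler = swa_utils.SWALR(optimizer, swa_lr=0.05)
swa_start = 5  # epoch at which SWA updates begin (100 in the trainer above)

for epoch in range(10):
    for x, y in loader:
        optimizer.zero_grad()
        criterion(model(x), y).backward()
        optimizer.step()
    if epoch >= swa_start:
        swa_model.update_parameters(model)  # fold the current weights into the running average
        swa_scheduler.step()                # anneal the learning rate towards swa_lr

# Recompute BatchNorm running statistics for the averaged weights (a no-op for this toy model).
swa_utils.update_bn(loader, swa_model)

The `update_bn` pass at the end matters whenever the network contains BatchNorm layers, since the averaged weights would otherwise be paired with stale running statistics.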
Example #3
def train_begin(self):
    self.swa_model = swa_utils.AveragedModel(self.trainer.model)
    swa_epochs = self.trainer.config.OPTIM.EPOCH - self.epoch_start
    anneal_epoch = int(swa_epochs * self.anneal_epoch)
    swa_lrs = [self.swa_lr]
    for pg in self.trainer.optim.param_groups[1:]:
        swa_lrs.append(pg['lr'] / 100)
    self.swa_scheduler = swa_utils.SWALR(self.trainer.optim,
                                         swa_lr=swa_lrs,
                                         anneal_epochs=anneal_epoch)
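
Worth noting about the snippet above: `SWALR` accepts either a single `swa_lr` or one value per parameter group, which is what the `swa_lrs` list relies on. A small standalone sketch of that behaviour (the two-group split below is purely illustrative):

import torch
from torch import nn
from torch.optim import swa_utils

net = nn.Sequential(nn.Linear(4, 8), nn.Linear(8, 2))
optimizer = torch.optim.SGD([
    {'params': net[0].parameters(), 'lr': 1e-2},  # e.g. backbone parameters
    {'params': net[1].parameters(), 'lr': 1e-1},  # e.g. head parameters
])
# One target SWA learning rate per parameter group, annealed over 5 epochs.
swa_scheduler = swa_utils.SWALR(optimizer, swa_lr=[1e-3, 1e-2], anneal_epochs=5)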
Example #4
def main(
        # Architectural hyperparameters. These are quite small for illustrative purposes.
        initial_noise_size=5,  # How many noise dimensions to sample at the start of the SDE.
        noise_size=3,  # How many dimensions the Brownian motion has.
        hidden_size=32,  # How big the hidden size of the generator SDE and the discriminator CDE are.
        mlp_size=16,  # How big the layers in the various MLPs are.
        num_layers=1,  # How many hidden layers to have in the various MLPs.

        # Training hyperparameters. Be prepared to tune these very carefully, as with any GAN.
        ratio=5,  # How many discriminator training steps to take per generator training step.
        gp_coeff=10,  # How much to regularise with gradient penalty.
        lr=1e-3,  # Learning rate often needs careful tuning to the problem.
        batch_size=1024,  # Batch size.
        steps=6000,  # How many steps to train both generator and discriminator for.
        init_mult1=3,  # Changing the initial parameter size can help.
        init_mult2=0.5,
        weight_decay=0.01,  # Weight decay.
        swa_step_start=500,  # When to start using stochastic weight averaging.

        # Evaluation and plotting hyperparameters
        steps_per_print=10,  # How often to print the loss.
        num_plot_samples=50,  # How many samples to use on the plots at the end.
        plot_locs=(
            0.1, 0.3, 0.5, 0.7, 0.9
        ),  # Plot some marginal distributions at this proportion of the way along.
):
    is_cuda = torch.cuda.is_available()
    device = 'cuda' if is_cuda else 'cpu'
    if not is_cuda:
        print(
            "Warning: CUDA not available; falling back to CPU but this is likely to be very slow."
        )

    # Data
    ts, data_size, train_dataloader = get_data(batch_size=batch_size,
                                               device=device)
    infinite_train_dataloader = (elem
                                 for it in iter(lambda: train_dataloader, None)
                                 for elem in it)

    # Models
    generator = Generator(data_size, initial_noise_size, noise_size,
                          hidden_size, mlp_size, num_layers).to(device)
    discriminator = Discriminator(data_size, hidden_size, mlp_size,
                                  num_layers).to(device)
    # Weight averaging really helps with GAN training.
    averaged_generator = swa_utils.AveragedModel(generator)
    averaged_discriminator = swa_utils.AveragedModel(discriminator)

    # Picking a good initialisation is important!
    # In this case these were picked by making the parameters for the t=0 part of the generator be roughly the right
    # size that the untrained t=0 distribution has a similar variance to the t=0 data distribution.
    # Then the func parameters were adjusted so that the t>0 distribution looked like it had about the right variance.
    # What we're doing here is very crude -- one can definitely imagine smarter ways of doing things.
    # (e.g. pretraining the t=0 distribution)
    with torch.no_grad():
        for param in generator._initial.parameters():
            param *= init_mult1
        for param in generator._func.parameters():
            param *= init_mult2

    # Optimisers. Adadelta turns out to be a much better choice than SGD or Adam, interestingly.
    generator_optimiser = torch.optim.Adadelta(generator.parameters(),
                                               lr=lr,
                                               weight_decay=weight_decay)
    discriminator_optimiser = torch.optim.Adadelta(discriminator.parameters(),
                                                   lr=lr,
                                                   weight_decay=weight_decay)

    # Train both generator and discriminator.
    trange = tqdm.tqdm(range(steps))
    for step in trange:
        train_generator(ts, batch_size, generator, discriminator,
                        generator_optimiser, discriminator_optimiser)
        for _ in range(ratio):
            real_samples, = next(infinite_train_dataloader)
            train_discriminator(ts, batch_size, real_samples, generator,
                                discriminator, discriminator_optimiser,
                                gp_coeff)

        # Stochastic weight averaging typically improves performance.
        if step > swa_step_start:
            averaged_generator.update_parameters(generator)
            averaged_discriminator.update_parameters(discriminator)

        if (step % steps_per_print) == 0 or step == steps - 1:
            total_unaveraged_loss = evaluate_loss(ts, batch_size,
                                                  train_dataloader, generator,
                                                  discriminator)
            if step > swa_step_start:
                total_averaged_loss = evaluate_loss(
                    ts, batch_size, train_dataloader,
                    averaged_generator.module, averaged_discriminator.module)
                trange.write(
                    f"Step: {step:3} Loss (unaveraged): {total_unaveraged_loss:.4f} "
                    f"Loss (averaged): {total_averaged_loss:.4f}")
            else:
                trange.write(
                    f"Step: {step:3} Loss (unaveraged): {total_unaveraged_loss:.4f}"
                )
    generator.load_state_dict(averaged_generator.module.state_dict())
    discriminator.load_state_dict(averaged_discriminator.module.state_dict())

    _, _, test_dataloader = get_data(batch_size=batch_size, device=device)

    plot(ts, generator, test_dataloader, num_plot_samples, plot_locs)
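
A small aside on the two `load_state_dict` calls near the end: copying the weights out of `averaged_generator.module` / `averaged_discriminator.module` lets the rest of the script keep using the plain `generator` and `discriminator` objects, unaware of the `AveragedModel` wrapper. A minimal standalone sketch of the same hand-off with a toy module (names are illustrative):

from torch import nn
from torch.optim import swa_utils

net = nn.Linear(3, 1)
averaged = swa_utils.AveragedModel(net)
averaged.update_parameters(net)  # in a real run this is called repeatedly during training

# Copy the averaged parameters back into the plain module so downstream code can use `net` directly.
net.load_state_dict(averaged.module.state_dict())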
Example #5
def ner_train(args, tokenizer, array, device):
    # do k-fold
    if len(args.fold) == 1: folds = [args.fold[0]]
    else: folds = range(args.fold[0], args.fold[1]+1)

    for fold in folds:
        # new model for each fold
        model = BERTNER(args).to(device)
        print('training start.. on fold', fold)

        if args.avg_steps:
            swa_model = swa_utils.AveragedModel(model, device)
            valid_model = swa_model
            valid_module = swa_model.module
        else:
            valid_model, valid_module = model, model

        # adversarial training (AT) setup
        if args.use_at == 'fgm': fgm = FGM(model)
        elif args.use_at == 'pgd':
            pgd = PGD(model)
            K = args.pgd_K

        # new tensorized data and maps
        train_data, valid_data = divide_by_type(array, args.num_fold, fold)
        train_loader = ner_tensorize(train_data, tokenizer, args, mode='random')
        valid_loader = ner_tensorize(valid_data, tokenizer, args, mode='seq')
        len_train = len(train_loader)

        # optim
        training_steps = args.max_epoches*len_train
        optimizer, scheduler = get_optimizer_scheduler(args, model, training_steps)

        # training
        stop_ct, best_F1, best_model, best_epoch = 0, 0, None, -1
        start_time = time()
        for i in range(args.max_epoches):
            model.train()
            train_losses = 0  # reset the running training loss each epoch

            # use tqdm
            if args.use_tqdm:
                train_iter = tqdm(train_loader, ncols=50)
                valid_iter = tqdm(valid_loader, ncols=50)
                train_iter.set_description('Train')
                valid_iter.set_description('Test')
            else:
                train_iter = train_loader
                valid_iter = valid_loader

            # training process
            for kdx, batch_data in enumerate(train_iter):          
                batch_data = tuple(i.to(device) for i in batch_data)
                ids, masks, _, labels = batch_data

                model.zero_grad()
                loss, logits = model(ids, masks, labels)
                if args.use_at == 'pgd':
                    _, _, ori_F1 = model.calculate_F1([logits], [labels])

                # process loss
                loss.backward()
                train_losses += loss.item()

                # FGM adversarial training
                if args.use_at == 'fgm':
                    fgm.attack()
                    loss_adv, _ = model(ids, masks, labels)
                    loss_adv.backward()
                    fgm.restore()

                # pgd adversarial training
                elif args.use_at == 'pgd':
                    pgd.backup_grad()
                    for t in range(K):
                        pgd.attack(is_first_attack=(t==0)) # add an adversarial perturbation to the embeddings; back up param.data on the first attack
                        if t != K-1:
                            model.zero_grad()
                        else:
                            pgd.restore_grad()
                        loss_adv, at_logits = model(ids, masks, labels)
                        _, _, new_F1 = model.calculate_F1([at_logits], [labels])
                        if new_F1 < ori_F1:
                            pgd.restore_grad()
                            loss_adv.backward()
                            break
                        loss_adv.backward() # backpropagate, accumulating the adversarial gradient on top of the normal gradient

                    pgd.restore() # restore embedding parameters

                # tackle exploding gradients
                torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=args.max_grad_norm)
                optimizer.step()
                scheduler.step()

            train_losses /= len_train
            
            # evaluate
            steps = max(1, args.avg_steps)
            if (i+1) % steps == 0:
                if args.avg_steps:
                    swa_model.update_parameters(model)
                with torch.no_grad():
                    valid_model.eval()
                    valid_losses = 0
                    pred_logits, pred_labels = [], []
                    for idx, batch_data in enumerate(valid_iter):
                        batch_data = tuple(i.to(device) for i in batch_data)
                        ids, masks, maps, labels = batch_data
                        
                        loss, logits = valid_model(ids, masks, labels)

                        pred_logits.append(logits)
                        pred_labels.append(labels)

                        # process loss
                        valid_losses += loss.item()

                    valid_losses /= len(valid_loader)

                    precision, recall, F1 = valid_module.calculate_F1(pred_logits, pred_labels)
            
                if args.save_models and args.avg_steps > 0:
                    torch.save(valid_module, args.model_dir + '/MOD' + str(fold) + '_' + str(i+1))

                print('Epoch %d train:%.2e valid:%.2e precision:%.4f recall:%.4f F1:%.4f time:%.0f' % \
                    (i+1, train_losses, valid_losses, precision, recall, F1, time()-start_time))
                start_time = time()

                if args.avg_steps == 0:
                    if F1 > best_F1:
                        stop_ct = 0
                        best_F1 = F1
                        best_model = copy.deepcopy(valid_module)
                        best_epoch = i+1
                    else:
                        stop_ct += 1
                        if stop_ct == args.stop_epoches:
                            if args.save_models:
                                torch.save(best_model, args.model_dir + '/MOD' + str(fold) + '_' + str(best_epoch))
                            break

                if i == args.max_epoches-1 and args.save_models:
                    torch.save(best_model, args.model_dir + '/MOD' + str(fold) + '_' + str(best_epoch))