Example No. 1
def main(args):

    train_cfg = config_from_json(args.train_cfg)
    model_cfg = config_from_json(args.model_cfg)
    model_cfg.block_size = model_cfg.max_len // model_cfg.n_blocks
    set_seeds(train_cfg.seed)

    print("Loading dataset")
    loader = PreTrainDataset(args.data_file, train_cfg, model_cfg)

    model = BertInnerForMaskedLM(model_cfg)

    if train_cfg.optimizer == "lamb":
        if train_cfg.opt_level != "" and train_cfg.opt_level is not None:
            optimizer = apex.optimizers.FusedLAMB(
                model.parameters(), **train_cfg.optimizer_parameters)
        else:
            optimizer = torch_optimizer.Lamb(model.parameters(),
                                             **train_cfg.optimizer_parameters)

    elif train_cfg.optimizer == "radam":
        optimizer = torch_optimizer.RAdam(model.parameters(),
                                          **train_cfg.optimizer_parameters)
    else:
        optimizer = optim4GPU(train_cfg, model)

    trainer = Trainer(loader, model, optimizer, args.save_dir, get_device(),
                      train_cfg.parallel, train_cfg.opt_level)

    if args.load_model != "":
        print("Loading checkpoint")
        trainer.load_model(args.load_model, args.load_dataset_state)

    trainer.train(train_cfg)
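The fallback above relies on project-specific config and model classes. As a minimal, self-contained sketch of the same apex-or-torch_optimizer pattern (the toy model and the optimizer keyword values here are placeholders, not from the original project):

import torch
import torch_optimizer

try:
    from apex.optimizers import FusedLAMB  # only available when NVIDIA apex is installed
except ImportError:
    FusedLAMB = None

model = torch.nn.Linear(16, 4)                              # placeholder model
optimizer_parameters = {"lr": 1e-3, "weight_decay": 0.01}   # placeholder hyperparameters

if FusedLAMB is not None:
    optimizer = FusedLAMB(model.parameters(), **optimizer_parameters)
else:
    optimizer = torch_optimizer.Lamb(model.parameters(), **optimizer_parameters)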
Example No. 2
def main(args):

    train_cfg = config_from_json(args.train_cfg)
    model_cfg = config_from_json(args.model_cfg)
    model_cfg.block_size = model_cfg.max_len // model_cfg.n_blocks

    set_seeds(train_cfg.seed)

    print("Loading dataset")
    loader = PreTrainDataset(args.data_file, train_cfg, model_cfg)
    model = BertInnerPreTrain(model_cfg)

    if train_cfg.optimizer == "lamb":
        optimizer = torch_optimizer.Lamb(model.parameters(),
                                         lr=train_cfg.lr,
                                         weight_decay=train_cfg.weight_decay)
    elif train_cfg.optimizer == "radam":
        optimizer = torch_optimizer.RAdam(model.parameters(),
                                          lr=train_cfg.lr,
                                          weight_decay=train_cfg.weight_decay)
    else:
        optimizer = optim4GPU(train_cfg, model)

    trainer = Trainer(loader, model, optimizer, args.save_dir, get_device(),
                      train_cfg.parallel)

    if args.load_dir != "":
        print("Loading checkpoint")
        trainer.load_model(args.load_dir, args.load_dataset_state)

    trainer.train(train_cfg)
Example No. 3
def get_optimizer(optimizer: str, model, optimizer_args):
    if optimizer == "sgd":
        return torch.optim.SGD(model.parameters(), **optimizer_args)
    elif optimizer == "adam":
        return torch.optim.Adam(model.parameters(), **optimizer_args)
    elif optimizer == "yogi":
        return optim.Yogi(model.parameters(), **optimizer_args)
    elif optimizer == "shampoo":
        return optim.Shampoo(model.parameters(), **optimizer_args)
    elif optimizer == "swats":
        return optim.SWATS(model.parameters(), **optimizer_args)
    elif optimizer == "sgdw":
        return optim.SGDW(model.parameters(), **optimizer_args)
    elif optimizer == "sgdp":
        return optim.SGDP(model.parameters(), **optimizer_args)
    elif optimizer == "rangerva":
        return optim.RangerVA(model.parameters(), **optimizer_args)
    elif optimizer == "rangerqh":
        return optim.RangerQH(model.parameters(), **optimizer_args)
    elif optimizer == "ranger":
        return optim.Ranger(model.parameters(), **optimizer_args)
    elif optimizer == "radam":
        return optim.RAdam(model.parameters(), **optimizer_args)
    elif optimizer == "qhm":
        return optim.QHM(model.parameters(), **optimizer_args)
    elif optimizer == "qhadam":
        return optim.QHAdam(model.parameters(), **optimizer_args)
    elif optimizer == "pid":
        return optim.PID(model.parameters(), **optimizer_args)
    elif optimizer == "novograd":
        return optim.NovoGrad(model.parameters(), **optimizer_args)
    elif optimizer == "lamb":
        return optim.Lamb(model.parameters(), **optimizer_args)
    elif optimizer == "diffgrad":
        return optim.DiffGrad(model.parameters(), **optimizer_args)
    elif optimizer == "apollo":
        return optim.Apollo(model.parameters(), **optimizer_args)
    elif optimizer == "aggmo":
        return optim.AggMo(model.parameters(), **optimizer_args)
    elif optimizer == "adamp":
        return optim.AdamP(model.parameters(), **optimizer_args)
    elif optimizer == "adafactor":
        return optim.Adafactor(model.parameters(), **optimizer_args)
    elif optimizer == "adamod":
        return optim.AdaMod(model.parameters(), **optimizer_args)
    elif optimizer == "adabound":
        return optim.AdaBound(model.parameters(), **optimizer_args)
    elif optimizer == "adabelief":
        return optim.AdaBelief(model.parameters(), **optimizer_args)
    elif optimizer == "accsgd":
        return optim.AccSGD(model.parameters(), **optimizer_args)
    elif optimizer == "a2graduni":
        return optim.A2GradUni(model.parameters(), **optimizer_args)
    elif optimizer == "a2gradinc":
        return optim.A2GradInc(model.parameters(), **optimizer_args)
    elif optimizer == "a2gradexp":
        return optim.A2GradExp(model.parameters(), **optimizer_args)
    else:
        raise Exception(f"Optimizer '{optimizer}' does not exist!")
Example No. 4
    def configure_optimizers(self):
        optimizer = None
        if self.hparams.optimizer == 'lamb':
            optimizer = ptoptim.Lamb(self.parameters(), lr=self.hparams.lr)
        elif self.hparams.optimizer == 'adam':
            optimizer = optim.Adam(self.parameters(), lr=self.hparams.lr)
        elif self.hparams.optimizer == 'adamw':
            optimizer = optim.AdamW(self.parameters(), lr=self.hparams.lr)
        else:
            optimizer = optim.Adam(self.parameters(), lr=self.hparams.lr)

        scheduler = None
        if self.hparams.lr_scheduler == 'cyclic':
            scheduler = optim.lr_scheduler.CyclicLR(optimizer,
                                                    base_lr=self.hparams.lr / 100.0,
                                                    max_lr=self.hparams.lr,
                                                    mode='triangular2')
        elif self.hparams.lr_scheduler == 'cosine':
            scheduler = optim.lr_scheduler.CosineAnnealingLR(
                optimizer, self.hparams.epochs * 10)
        elif self.hparams.lr_scheduler == 'cosinewr':
            scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
                optimizer, T_0=10, T_mult=2, eta_min=1e-5)
        elif self.hparams.lr_scheduler == 'plateau':
            mode, monitor = 'min', self.val_loss
            if self.hparams.classify:
                mode, monitor = 'max', self.val_acc
            scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                             threshold=0.001,
                                                             mode=mode)
            scheduler = {
                'scheduler': scheduler,  # the LR scheduler instance (required)
                'interval': 'epoch',     # unit of the scheduler's step size ('epoch' or 'step')
                'frequency': 1,          # how often the scheduler is stepped
                'monitor': monitor,      # metric for `ReduceLROnPlateau` to monitor
                'strict': True,          # crash the training if `monitor` is not found
                'name': None,            # custom name for `LearningRateMonitor` to use
            }
        elif self.hparams.lr_scheduler == 'step':
            step_size = getattr(self.hparams, 'step_size', 2)
            n_steps = getattr(self.hparams, 'n_steps', 3)
            step_factor = getattr(self.hparams, 'step_factor', 0.1)
            milestones = list(
                range(step_size, step_size * (n_steps + 1), step_size))
            scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                                       milestones=milestones,
                                                       gamma=step_factor)

        if scheduler is None:
            return optimizer
        else:
            return [optimizer], [scheduler]
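Outside PyTorch Lightning the schedulers above have to be stepped manually; a minimal sketch with a placeholder model and metric, showing why the 'plateau' branch needs the monitor entry: ReduceLROnPlateau is the one scheduler that is stepped with the monitored value rather than unconditionally.

import torch
from torch import optim

model = torch.nn.Linear(8, 2)                    # placeholder model
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', threshold=0.001)

for epoch in range(3):
    val_loss = 1.0 / (epoch + 1)                 # placeholder validation metric
    scheduler.step(val_loss)                     # ReduceLROnPlateau steps on the monitored value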
Example No. 5
def main(args):

    train_cfg = config_from_json(args.train_cfg)
    model_cfg = config_from_json(args.model_cfg)
    model_cfg.block_size = model_cfg.max_len // model_cfg.n_blocks
    set_seeds(train_cfg.seed)

    if model_cfg.projection not in ["dense", "cnn"]:
        if args.max_len == 0:
            model_cfg.reduced_max_len = model_cfg.max_len
        else:
            model_cfg.reduced_max_len = args.max_len
        if args.reduce_block_size:
            assert model_cfg.reduced_max_len % model_cfg.n_blocks == 0, "reduced_max_len must be divisible by n_blocks"
            model_cfg.block_size = model_cfg.reduced_max_len // model_cfg.n_blocks
        else:
            assert model_cfg.reduced_max_len % model_cfg.block_size == 0, "reduced_max_len must be divisible by the initial block_size"
            model_cfg.n_blocks = model_cfg.reduced_max_len // model_cfg.block_size
        print("max_len:", model_cfg.reduced_max_len, "block_size:", model_cfg.block_size, "n_blocks:", model_cfg.n_blocks)
    else:
        if args.max_len != 0:
            warnings.warn("Projection is incompatible with a reduced max len, using default max_len")

    
    print("Loading dataset")
    (data, labels), criterion = get_data_and_optimizer_from_dataset(args.data_file, train_cfg.task)

    loader = GlueDataset(data, labels, train_cfg, model_cfg)
    model = BertInnerForSequenceClassification(model_cfg, loader.get_n_labels(), criterion)

    if train_cfg.optimizer == "lamb":
        if train_cfg.opt_level != "" and train_cfg.opt_level is not None:
            optimizer = apex.optimizers.FusedLAMB(model.parameters(), **train_cfg.optimizer_parameters)
        else:
            optimizer = torch_optimizer.Lamb(model.parameters(), **train_cfg.optimizer_parameters)

    elif train_cfg.optimizer == "radam":
        optimizer = torch_optimizer.RAdam(model.parameters(), **train_cfg.optimizer_parameters)
    elif train_cfg.optimizer == "sgd":
        optimizer = optim.SGD(model.parameters(), **train_cfg.optimizer_parameters)
    else:
        optimizer = optim4GPU(train_cfg, model)

    trainer = GlueTrainer(loader, model, optimizer, args.save_dir, get_device(), train_cfg.parallel)

    if args.load_model != "":
        print("Loading checkpoint")
        trainer.load_model(args.load_model, args.load_dataset_state)

    if not args.eval:
        trainer.train(train_cfg)
    else:
        trainer.eval(train_cfg)
Example No. 6
def create_optimizer(arg, parameters, create_scheduler=False, discrim=False):
    lr = arg.lr_discrim if discrim else arg.lr
    weight_decay = arg.weight_decay_discrim if discrim else arg.weight_decay
    if arg.optimizer == 'Lamb':
        optimizer = optim.Lamb(parameters,
                               lr=lr,
                               weight_decay=weight_decay,
                               betas=(0.5, 0.999))
    elif arg.optimizer == 'AdaBound':
        optimizer = optim.AdaBound(parameters,
                                   lr=lr,
                                   weight_decay=weight_decay,
                                   betas=(0.5, 0.999))
    elif arg.optimizer == 'Yogi':
        optimizer = optim.Yogi(parameters,
                               lr=lr,
                               weight_decay=weight_decay,
                               betas=(0.5, 0.999))
    elif arg.optimizer == 'DiffGrad':
        optimizer = optim.DiffGrad(parameters,
                                   lr=lr,
                                   weight_decay=weight_decay,
                                   betas=(0.5, 0.999))
    elif arg.optimizer == 'Adam':
        optimizer = torch.optim.Adam(parameters,
                                     lr=lr,
                                     weight_decay=weight_decay,
                                     betas=(0.5, 0.999))
    else:
        optimizer = torch.optim.SGD(parameters,
                                    lr=lr,
                                    momentum=arg.momentum,
                                    weight_decay=weight_decay)

    if create_scheduler:
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                                   factor=0.2,
                                                   patience=4,
                                                   threshold=1e-2,
                                                   verbose=True)
    else:
        scheduler = None

    return optimizer, scheduler
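A hypothetical call to create_optimizer, assuming an argparse-style namespace with the fields the function reads (all names and values below are illustrative, not taken from the original project):

import torch
from argparse import Namespace

arg = Namespace(optimizer='Lamb', lr=1e-3, lr_discrim=4e-4, momentum=0.9,
                weight_decay=1e-2, weight_decay_discrim=0.0)
model = torch.nn.Linear(32, 1)   # placeholder model
optimizer, scheduler = create_optimizer(arg, model.parameters(), create_scheduler=True)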
Example No. 7
def train(args):
    use_cuda = args.num_gpus > 0
    device = torch.device("cuda:0" if use_cuda else "cpu")

    print(device)

    train_loader = _get_train_data_loader(args.batch_size, args.data_dir)
    #model = get_model(args.model_checkpoint, args.num_labels)

    #model = ToyModel()

    model = TextClassifier(2)

    torch.manual_seed(args.seed)
    if use_cuda:
        torch.cuda.manual_seed(args.seed)

    if args.num_gpus > 1:
        model = torch.nn.DataParallel(model)
        print('data parallel model')
    model.to(device)  # move the (possibly wrapped) model to the target device

    # Maybe use different optimizer????
    optimizer = optim.Lamb(model.parameters(),
                           lr=args.lr,
                           betas=(0.9, 0.999),
                           eps=args.epsilon,
                           weight_decay=args.weight_decay)

    # Maybe use different loss function
    loss_fn = nn.CrossEntropyLoss().to(device)

    for epoch in range(1, args.epochs + 1):
        model.train()

        for step, batch in enumerate(train_loader):
            b_input_ids = batch['input_ids'].to(device)
            #b_input_ids = batch['input_ids'].type(torch.FloatTensor).to(device)
            b_input_mask = batch['attention_mask'].to(device)
            b_labels = batch['targets'].to(device)

            outputs = model(b_input_ids, attention_mask=b_input_mask)
            #outputs = model(b_input_ids)

            #loss = loss_fn(outputs.logits, b_labels)
            loss = loss_fn(outputs[:, -1], b_labels)
            #loss = loss_fn(outputs, b_labels)

            loss.backward()
            #torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) ?????
            optimizer.step()
            optimizer.zero_grad()

            if args.verbose and step % 100 == 0:
                print('Batch', step)

    #if args.num_gpus > 1:
    #    model.module.save_pretrained(args.model_dir)
    #else:
    #    model.save_pretrained(args.model_dir)

    eval_loader = _get_eval_data_loader(args.test_batch_size, args.data_dir)
    test(model, eval_loader, device)
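If the gradient clipping hinted at by the commented-out line is wanted, it belongs between loss.backward() and optimizer.step(); a minimal self-contained sketch with a placeholder model and loss:

import torch

model = torch.nn.Linear(4, 2)                     # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss = model(torch.randn(3, 4)).sum()             # placeholder loss

loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # clip between backward() and step()
optimizer.step()
optimizer.zero_grad()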
Example No. 8
def build_optimizer(cfg, model):
    name_optimizer = cfg.optimizer.type
    optimizer = None

    if name_optimizer == 'A2GradExp':
        optimizer = optim.A2GradExp(model.parameters(), lr=cfg.optimizer.lr)
    elif name_optimizer == 'A2GradInc':
        optimizer = optim.A2GradInc(model.parameters(), lr=cfg.optimizer.lr)
    elif name_optimizer == 'A2GradUni':
        optimizer = optim.A2GradUni(model.parameters(), lr=cfg.optimizer.lr)
    elif name_optimizer == 'AccSGD':
        optimizer = optim.AccSGD(model.parameters(), lr=cfg.optimizer.lr)
    elif name_optimizer == 'AdaBelief':
        optimizer = optim.AdaBelief(model.parameters(), lr=cfg.optimizer.lr)
    elif name_optimizer == 'AdaBound':
        optimizer = optim.AdaBound(model.parameters(), lr=cfg.optimizer.lr)
    elif name_optimizer == 'AdaMod':
        optimizer = optim.AdaMod(model.parameters(), lr=cfg.optimizer.lr)
    elif name_optimizer == 'Adafactor':
        optimizer = optim.Adafactor(model.parameters(), lr=cfg.optimizer.lr)
    elif name_optimizer == 'AdamP':
        optimizer = optim.AdamP(model.parameters(), lr=cfg.optimizer.lr)
    elif name_optimizer == 'AggMo':
        optimizer = optim.AggMo(model.parameters(), lr=cfg.optimizer.lr)
    elif name_optimizer == 'Apollo':
        optimizer = optim.Apollo(model.parameters(), lr=cfg.optimizer.lr)
    elif name_optimizer == 'DiffGrad':
        optimizer = optim.DiffGrad(model.parameters(), lr=cfg.optimizer.lr)
    elif name_optimizer == 'Lamb':
        optimizer = optim.Lamb(model.parameters(), lr=cfg.optimizer.lr)
    elif name_optimizer == 'Lookahead':
        yogi = optim.Yogi(model.parameters(), lr=cfg.optimizer.lr)
        optimizer = optim.Lookahead(yogi, k=5, alpha=0.5)
    elif name_optimizer == 'NovoGrad':
        optimizer = optim.NovoGrad(model.parameters(), lr=cfg.optimizer.lr)
    elif name_optimizer == 'PID':
        optimizer = optim.PID(model.parameters(), lr=cfg.optimizer.lr)
    elif name_optimizer == 'QHAdam':
        optimizer = optim.QHAdam(model.parameters(), lr=cfg.optimizer.lr)
    elif name_optimizer == 'QHM':
        optimizer = optim.QHM(model.parameters(), lr=cfg.optimizer.lr)
    elif name_optimizer == 'RAdam':
        optimizer = optim.RAdam(model.parameters(), lr=cfg.optimizer.lr)
    elif name_optimizer == 'Ranger':
        optimizer = optim.Ranger(model.parameters(), lr=cfg.optimizer.lr)
    elif name_optimizer == 'RangerQH':
        optimizer = optim.RangerQH(model.parameters(), lr=cfg.optimizer.lr)
    elif name_optimizer == 'RangerVA':
        optimizer = optim.RangerVA(model.parameters(), lr=cfg.optimizer.lr)
    elif name_optimizer == 'SGDP':
        optimizer = optim.SGDP(model.parameters(), lr=cfg.optimizer.lr)
    elif name_optimizer == 'SGDW':
        optimizer = optim.SGDW(model.parameters(), lr=cfg.optimizer.lr)
    elif name_optimizer == 'SWATS':
        optimizer = optim.SWATS(model.parameters(), lr=cfg.optimizer.lr)
    elif name_optimizer == 'Shampoo':
        optimizer = optim.Shampoo(model.parameters(), lr=cfg.optimizer.lr)
    elif name_optimizer == 'Yogi':
        optimizer = optim.Yogi(model.parameters(), lr=cfg.optimizer.lr)
    elif name_optimizer == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=cfg.optimizer.lr,
                                     weight_decay=cfg.optimizer.weight_decay)
    elif name_optimizer == 'SGD':
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=cfg.optimizer.lr,
                                    momentum=cfg.optimizer.momentum,
                                    weight_decay=cfg.optimizer.weight_decay)
    if optimizer is None:
        raise ValueError(f"Unknown optimizer '{name_optimizer}'")
    return optimizer
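Because every class used above is exported at the top level of torch_optimizer, the chain can also be sketched as a name lookup, assuming cfg.optimizer.type matches the exported class name exactly (the Lookahead wrapper and the extra Adam/SGD arguments would still need special-casing, as in the original):

import torch
import torch_optimizer

def build_optimizer_by_name(name, model, lr):
    if name == 'Adam':
        return torch.optim.Adam(model.parameters(), lr=lr)
    if name == 'SGD':
        return torch.optim.SGD(model.parameters(), lr=lr)
    optimizer_cls = getattr(torch_optimizer, name, None)  # e.g. 'Lamb' -> torch_optimizer.Lamb
    if optimizer_cls is None:
        raise ValueError(f"Unknown optimizer '{name}'")
    return optimizer_cls(model.parameters(), lr=lr)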