def get_optimizer(self, lr, t_total, schedule_type='warmup_linear', optimizer_type='lamb'):
    SCHEDULES = {
        None: ConstantLRSchedule,
        "none": ConstantLRSchedule,
        "warmup_cosine": WarmupCosineSchedule,
        "warmup_constant": WarmupConstantSchedule,
        "warmup_linear": WarmupLinearSchedule,
        "warmup_cosine_hard_restarts": WarmupCosineWithHardRestartsSchedule,
    }

    # Prepare optimiser and schedule; bias and LayerNorm weights are exempt from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in self.model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': self.weight_decay},
        {'params': [p for n, p in self.model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    if optimizer_type == 'lamb':
        optimizer = Lamb(optimizer_grouped_parameters, lr=lr, eps=self.adam_epsilon)
    elif optimizer_type == 'adamw':
        optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=self.adam_epsilon)
    else:
        raise ValueError("Unknown optimizer_type: %s" % optimizer_type)
    schedule_class = SCHEDULES[schedule_type]
    scheduler = schedule_class(optimizer, warmup_steps=self.warmup_steps, t_total=t_total)
    return optimizer, scheduler
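A minimal usage sketch for this variant. The loop shape and the `trainer`, `train_loader`, and `compute_loss` names are assumptions for illustration, not part of the original; the `pytorch_transformers` schedule classes above are stepped once per optimizer update.

# `trainer` is assumed to be an instance of the class defining get_optimizer.
optimizer, scheduler = trainer.get_optimizer(lr=3e-4, t_total=num_training_steps)
for batch in train_loader:
    loss = compute_loss(trainer.model, batch)  # hypothetical loss computation
    loss.backward()
    optimizer.step()
    scheduler.step()   # advance the warmup/decay schedule each optimizer step
    optimizer.zero_grad()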
def get_optimizer(model, lr=5e-5, weight_decay=0.01,
                  no_decay=('bias', 'layer_norm', 'LayerNorm'),
                  optimizer_name='bert', param_optimizer=None):
    if param_optimizer is None:
        param_optimizer = list(model.named_parameters())
    # Pooler weights are excluded from optimisation.
    param_optimizer = [n for n in param_optimizer if 'pool' not in n[0]]
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if _do_use_weight_decay(n, no_decay)],
        'weight_decay': weight_decay
    }, {
        'params': [p for n, p in param_optimizer if not _do_use_weight_decay(n, no_decay)],
        'weight_decay': 0.0
    }]
    if optimizer_name == 'lamb':
        try:
            from pytorch_lamb import Lamb
            return Lamb(optimizer_grouped_parameters, lr=lr)
        except ImportError:
            pass  # fall through to BertAdam if pytorch_lamb is not installed
    elif optimizer_name == 'adamw':
        return AdamW(optimizer_grouped_parameters, lr=lr)
    return BertAdam(optimizer_grouped_parameters, lr=lr)
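`_do_use_weight_decay` is referenced here but not defined in the snippet. A plausible minimal definition, consistent with how it is called (a parameter name plus a tuple of no-decay substrings):

def _do_use_weight_decay(param_name, no_decay):
    """True if none of the no-decay substrings occur in the parameter name."""
    return not any(nd in param_name for nd in no_decay)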
def get_optimizer(self, lr, optimizer_type="lamb"): # Prepare optimiser and schedule no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": self.weight_decay, }, { "params": [ p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] if optimizer_type == "lamb": optimizer = Lamb(optimizer_grouped_parameters, lr=lr, eps=self.adam_epsilon) elif optimizer_type == "adamw": optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=self.adam_epsilon) return optimizer
def return_optim(model, optim_type, lr):
    if optim_type == "adam":
        return optim.Adam(model.parameters(), lr=lr)
    elif optim_type == "radam":
        return toptim.RAdam(model.parameters(), lr=lr)
    elif optim_type == "lamb":
        return Lamb(model.parameters(), lr=lr)
    else:
        raise ValueError("Invalid optimizer type")
def get_optimizer(net_conf, model):
    if net_conf["optimizer"] == "adam":
        optimizer = {
            "generator": optim.Adam(model["G"].parameters(), lr=net_conf["lr"]),
            "discriminator": optim.Adam(model["D"].parameters(),
                                        lr=net_conf["discriminator_lr"]),
        }
    elif net_conf["optimizer"] == "radam":
        optimizer = {
            "generator": toptim.RAdam(model["G"].parameters(), lr=net_conf["lr"]),
            "discriminator": toptim.RAdam(model["D"].parameters(),
                                          lr=net_conf["discriminator_lr"]),
        }
    elif net_conf["optimizer"] == "lamb":
        # NOTE: unlike the branches above, this one uses the generator lr for
        # both networks and ignores net_conf["discriminator_lr"].
        optimizer = {
            "generator": Lamb(model["G"].parameters(), lr=net_conf["lr"],
                              weight_decay=0.01, betas=(0.9, 0.999), adam=False),
            "discriminator": Lamb(model["D"].parameters(), lr=net_conf["lr"],
                                  weight_decay=0.01, betas=(0.9, 0.999), adam=False),
        }
    else:
        raise ValueError("optimizer must be [adam, radam, lamb]")
    return optimizer
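How the returned dict is typically consumed, as a rough sketch; the loop body, `dataloader`, and the loss helpers are assumed names for illustration:

optimizers = get_optimizer(net_conf, model)
for batch in dataloader:                       # dataloader assumed
    d_loss = discriminator_loss(model, batch)  # hypothetical loss helpers
    optimizers["discriminator"].zero_grad()
    d_loss.backward()
    optimizers["discriminator"].step()

    g_loss = generator_loss(model, batch)
    optimizers["generator"].zero_grad()
    g_loss.backward()
    optimizers["generator"].step()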
def configure_optimizers(self):
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in self.generator.named_parameters()
                       if not any(nd in n for nd in no_decay)]
                      + [p for n, p in self.discriminator.named_parameters()
                         if not any(nd in n for nd in no_decay)],
            "weight_decay": self.config.weight_decay,
        },
        {
            "params": [p for n, p in self.generator.named_parameters()
                       if any(nd in n for nd in no_decay)]
                      + [p for n, p in self.discriminator.named_parameters()
                         if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    t_total = self.config.num_steps
    optimizer = Lamb(optimizer_grouped_parameters,
                     lr=self.config.learning_rate, eps=self.config.epsilon)
    if self.config.lr_schedule == 'linear':
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=self.config.warmup_steps,
            num_training_steps=t_total)
    elif self.config.lr_schedule == 'cosine':
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=self.config.warmup_steps,
            num_training_steps=t_total)
    elif self.config.lr_schedule == 'constant':
        scheduler = get_constant_schedule_with_warmup(
            optimizer, num_warmup_steps=self.config.warmup_steps)
    else:
        raise ValueError("Unknown lr_schedule: %s" % self.config.lr_schedule)
    # 'interval': 'step' makes Lightning call scheduler.step() after every
    # optimizer step rather than once per epoch.
    scheduler_config = {'scheduler': scheduler, 'interval': 'step'}
    return [optimizer], [scheduler_config]
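Since the generator/discriminator comprehensions above are duplicated, the same split can be factored into a small helper; a sketch (not in the original) that builds the two groups for any set of modules:

def grouped_params(modules, weight_decay, no_decay=("bias", "LayerNorm.weight")):
    """Split parameters of several modules into decayed and decay-exempt groups."""
    decay, exempt = [], []
    for module in modules:
        for name, param in module.named_parameters():
            (exempt if any(nd in name for nd in no_decay) else decay).append(param)
    return [{"params": decay, "weight_decay": weight_decay},
            {"params": exempt, "weight_decay": 0.0}]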
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--optimizer', type=str, default='lamb',
                        choices=['lamb', 'adam'], help='which optimizer to use')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=6, metavar='N',
                        help='number of epochs to train (default: 6)')
    parser.add_argument('--lr', type=float, default=0.0025, metavar='LR',
                        help='learning rate (default: 0.0025)')
    parser.add_argument('--wd', type=float, default=0.01, metavar='WD',
                        help='weight decay (default: 0.01)')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    args = parser.parse_args()
    use_cuda = torch.cuda.is_available()

    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)

    model = Net().to(device)
    # adam=True collapses LAMB to plain Adam by forcing the trust ratio to 1.
    optimizer = Lamb(model.parameters(), lr=args.lr, weight_decay=args.wd,
                     betas=(.9, .999), adam=(args.optimizer == 'adam'))
    writer = SummaryWriter()

    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch, writer)
        test(args, model, device, test_loader, writer, epoch)
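For reference, the layer-wise trust ratio that the `adam` flag disables. A minimal sketch of one LAMB update as implemented in the cybertronai pytorch-lamb package (tensor names are illustrative; the package applies this per parameter tensor and, per v3 of the paper, skips bias correction of the moments):

# m, v: exponential moving averages of grad and grad**2 for weight tensor p
adam_step = m / (v.sqrt() + eps)
if weight_decay != 0:
    adam_step = adam_step + weight_decay * p
weight_norm = p.norm().clamp(0, 10)   # the implementation caps the weight norm at 10
adam_norm = adam_step.norm()
if weight_norm == 0 or adam_norm == 0:
    trust_ratio = 1.0
else:
    trust_ratio = weight_norm / adam_norm
if adam:                              # adam=True disables the trust ratio entirely
    trust_ratio = 1.0
p.data.add_(adam_step, alpha=-lr * trust_ratio)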
def get_optimizer(self, lr, optimizer_type='lamb'):
    # Prepare optimiser and schedule
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in self.model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': self.weight_decay},
        {'params': [p for n, p in self.model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    if optimizer_type == 'lamb':
        optimizer = Lamb(optimizer_grouped_parameters, lr=lr, eps=self.adam_epsilon)
    elif optimizer_type == 'adamw':
        optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=self.adam_epsilon)
    else:
        raise ValueError('Unknown optimizer_type: %s' % optimizer_type)
    return optimizer
def optimizer_setup(state):
    model = state.model
    if state.args.optim.lower() == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=state.args.lr,
                              momentum=state.args.mom)
    elif state.args.optim.lower() == 'lamb':
        optimizer = Lamb(model.parameters(), lr=state.args.lr,
                         weight_decay=state.args.wd)
    else:
        assert state.args.optim.lower() == 'adam'
        optimizer = optim.Adam(model.parameters(), lr=state.args.lr,
                               weight_decay=state.args.wd)
    state.optimizer = optimizer
def optimizer_setup(state):
    model = state.model
    if state.args.optim.lower() == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=state.args.lr,
                              momentum=state.args.mom)
    elif state.args.optim.lower() == 'lamb':
        optimizer = Lamb(model.parameters(), lr=state.args.lr,
                         weight_decay=state.args.wd)
    else:
        assert state.args.optim.lower() == 'adam'
        optimizer = optim.Adam(model.parameters(), lr=state.args.lr,
                               weight_decay=state.args.wd)
    state.optimizer = optimizer

    if state.args.fp16:
        state.model = FP16_Module(state.model)
        state.optimizer = FP16_Optimizer(
            state.optimizer,
            static_loss_scale=state.args.static_loss_scale,
            dynamic_loss_scale=state.args.dynamic_loss_scale,
            dynamic_loss_args={'init_scale': 2 ** 16},
            verbose=False)
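FP16_Module and FP16_Optimizer here are the legacy apex.fp16_utils wrappers, which replace the usual loss.backward() with their own scaled backward pass. A sketch of the training step that legacy API expects (the forward pass is hypothetical):

loss = state.model(inputs)      # hypothetical forward/loss computation
state.optimizer.backward(loss)  # the wrapper scales the loss before backprop
state.optimizer.step()          # unscales grads, updates fp32 master weights
state.optimizer.zero_grad()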
def init_optimizer(args, model, criterion):
    params = list(model.parameters()) + list(criterion.parameters())
    total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0]
                       for x in params if x.size())
    print('Args:', args)
    print('Model total parameters:', total_params)

    optimizer = None
    # Ensure the optimizer is optimizing params, which includes both the model's
    # weights as well as the criterion's weights (i.e. Adaptive Softmax)
    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(params, lr=args.lr, weight_decay=args.wdecay)
    if args.optimizer == 'adagrad':
        optimizer = torch.optim.Adagrad(params, lr=args.lr, weight_decay=args.wdecay)
    if args.optimizer == 'adam':
        optimizer = torch.optim.Adam(params, lr=args.lr, weight_decay=args.wdecay)
    if args.optimizer == 'adamw':
        optimizer = torch.optim.AdamW(params, lr=args.lr, weight_decay=args.wdecay)
    if args.optimizer == 'lamb':
        from pytorch_lamb import Lamb
        optimizer = Lamb(params, lr=args.lr, weight_decay=args.wdecay, min_trust=0.25)
        # optimizer = Lamb(params, lr=args.lr, weight_decay=args.wdecay, min_trust=0.1)
        # optimizer = Lamb(params, lr=args.lr, weight_decay=args.wdecay, min_trust=0, random_min_trust=0.2, random_trust_dice=10)
        # optimizer = Lamb(params, lr=args.lr, weight_decay=args.wdecay, min_trust=0.2, random_min_trust=0.5, random_trust_dice=4)

    from lookahead import Lookahead
    if False:  # Lookahead wrapper, currently disabled
        k, alpha = 5, 0.8
        print('Lookahead - k {} and alpha {}'.format(k, alpha))
        optimizer = Lookahead(base_optimizer=optimizer, k=k, alpha=alpha)

    from apex import amp  # required for amp.initialize below
    model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
    # model, optimizer = amp.initialize(model, optimizer, opt_level='O2')
    return model, optimizer, params
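`min_trust` (and the `random_*` variants in the commented-out lines) are not in the upstream pytorch-lamb release; they appear in forks, and conceptually `min_trust` just floors the layer-wise trust ratio. A sketch of the idea, reusing the names from the trust-ratio sketch earlier:

trust_ratio = weight_norm / adam_norm if weight_norm and adam_norm else 1.0
if min_trust:
    trust_ratio = max(trust_ratio, min_trust)  # e.g. 0.25 keeps every layer moving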
def get_optimizer(self, lr, t_total, schedule_type='warmup_linear', optimizer_type='lamb'):
    # Prepare optimiser and schedule
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in self.model.named_parameters()
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': self.weight_decay
    }, {
        'params': [p for n, p in self.model.named_parameters()
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    if optimizer_type == 'lamb':
        optimizer = Lamb(optimizer_grouped_parameters, lr=lr, eps=self.adam_epsilon)
    elif optimizer_type == 'adamw':
        optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=self.adam_epsilon)
    else:
        raise ValueError('Unknown optimizer_type: %s' % optimizer_type)
    # SCHEDULES is the module-level name -> schedule-class mapping shown in the
    # first variant of this function above.
    schedule_class = SCHEDULES[schedule_type]
    scheduler = schedule_class(optimizer, warmup_steps=self.warmup_steps, t_total=t_total)
    return optimizer, scheduler
def train(args): """Train E2E VC model.""" set_deterministic_pytorch(args) # check cuda availability if not torch.cuda.is_available(): logging.warning("cuda is not available") # get input and output dimension info with open(args.valid_json, "rb") as f: valid_json = json.load(f)["utts"] utts = list(valid_json.keys()) # In TTS, this is reversed, but not in VC. See `espnet.utils.training.batchfy` idim = int(valid_json[utts[0]]["input"][0]["shape"][1]) odim = int(valid_json[utts[0]]["output"][0]["shape"][1]) logging.info("#input dims : " + str(idim)) logging.info("#output dims: " + str(odim)) # get extra input and output dimenstion if args.use_speaker_embedding: args.spk_embed_dim = int(valid_json[utts[0]]["input"][1]["shape"][0]) else: args.spk_embed_dim = None if args.use_second_target: args.spc_dim = int(valid_json[utts[0]]["input"][1]["shape"][1]) else: args.spc_dim = None # write model config if not os.path.exists(args.outdir): os.makedirs(args.outdir) model_conf = args.outdir + "/model.json" with open(model_conf, "wb") as f: logging.info("writing a model config file to" + model_conf) f.write( json.dumps( (idim, odim, vars(args)), indent=4, ensure_ascii=False, sort_keys=True ).encode("utf_8") ) for key in sorted(vars(args).keys()): logging.info("ARGS: " + key + ": " + str(vars(args)[key])) # specify model architecture if args.enc_init is not None or args.dec_init is not None: model = load_trained_modules(idim, odim, args, TTSInterface) else: model_class = dynamic_import(args.model_module) model = model_class(idim, odim, args) assert isinstance(model, TTSInterface) logging.info(model) reporter = model.reporter # freeze modules, if specified if args.freeze_mods: for mod, param in model.named_parameters(): if any(mod.startswith(key) for key in args.freeze_mods): logging.info("freezing %s" % mod) param.requires_grad = False for mod, param in model.named_parameters(): if not param.requires_grad: logging.info("Frozen module %s" % mod) # check the use of multi-gpu if args.ngpu > 1: model = torch.nn.DataParallel(model, device_ids=list(range(args.ngpu))) if args.batch_size != 0: logging.warning( "batch size is automatically increased (%d -> %d)" % (args.batch_size, args.batch_size * args.ngpu) ) args.batch_size *= args.ngpu # set torch device device = torch.device("cuda" if args.ngpu > 0 else "cpu") model = model.to(device) logging.warning( "num. model params: {:,} (num. 
trained: {:,} ({:.1f}%))".format( sum(p.numel() for p in model.parameters()), sum(p.numel() for p in model.parameters() if p.requires_grad), sum(p.numel() for p in model.parameters() if p.requires_grad) * 100.0 / sum(p.numel() for p in model.parameters()), ) ) # Setup an optimizer if args.opt == "adam": optimizer = torch.optim.Adam( model.parameters(), args.lr, eps=args.eps, weight_decay=args.weight_decay ) elif args.opt == "noam": from espnet.nets.pytorch_backend.transformer.optimizer import get_std_opt optimizer = get_std_opt( model, args.adim, args.transformer_warmup_steps, args.transformer_lr ) elif args.opt == "lamb": from pytorch_lamb import Lamb optimizer = Lamb( model.parameters(), lr=args.lr, weight_decay=0.01, betas=(0.9, 0.999) ) else: raise NotImplementedError("unknown optimizer: " + args.opt) # FIXME: TOO DIRTY HACK setattr(optimizer, "target", reporter) setattr(optimizer, "serialize", lambda s: reporter.serialize(s)) # read json data with open(args.train_json, "rb") as f: train_json = json.load(f)["utts"] with open(args.valid_json, "rb") as f: valid_json = json.load(f)["utts"] use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0 if use_sortagrad: args.batch_sort_key = "input" # make minibatch list (variable length) train_batchset = make_batchset( train_json, args.batch_size, args.maxlen_in, args.maxlen_out, args.minibatches, batch_sort_key=args.batch_sort_key, min_batch_size=args.ngpu if args.ngpu > 1 else 1, shortest_first=use_sortagrad, count=args.batch_count, batch_bins=args.batch_bins, batch_frames_in=args.batch_frames_in, batch_frames_out=args.batch_frames_out, batch_frames_inout=args.batch_frames_inout, swap_io=False, iaxis=0, oaxis=0, ) valid_batchset = make_batchset( valid_json, args.batch_size, args.maxlen_in, args.maxlen_out, args.minibatches, batch_sort_key=args.batch_sort_key, min_batch_size=args.ngpu if args.ngpu > 1 else 1, count=args.batch_count, batch_bins=args.batch_bins, batch_frames_in=args.batch_frames_in, batch_frames_out=args.batch_frames_out, batch_frames_inout=args.batch_frames_inout, swap_io=False, iaxis=0, oaxis=0, ) load_tr = LoadInputsAndTargets( mode="vc", use_speaker_embedding=args.use_speaker_embedding, use_second_target=args.use_second_target, preprocess_conf=args.preprocess_conf, preprocess_args={"train": True}, # Switch the mode of preprocessing keep_all_data_on_mem=args.keep_all_data_on_mem, ) load_cv = LoadInputsAndTargets( mode="vc", use_speaker_embedding=args.use_speaker_embedding, use_second_target=args.use_second_target, preprocess_conf=args.preprocess_conf, preprocess_args={"train": False}, # Switch the mode of preprocessing keep_all_data_on_mem=args.keep_all_data_on_mem, ) converter = CustomConverter() # hack to make batchsize argument as 1 # actual bathsize is included in a list train_iter = { "main": ChainerDataLoader( dataset=TransformDataset( train_batchset, lambda data: converter([load_tr(data)]) ), batch_size=1, num_workers=args.num_iter_processes, shuffle=not use_sortagrad, collate_fn=lambda x: x[0], ) } valid_iter = { "main": ChainerDataLoader( dataset=TransformDataset( valid_batchset, lambda data: converter([load_cv(data)]) ), batch_size=1, shuffle=False, collate_fn=lambda x: x[0], num_workers=args.num_iter_processes, ) } # Set up a trainer updater = CustomUpdater( model, args.grad_clip, train_iter, optimizer, device, args.accum_grad ) trainer = training.Trainer(updater, (args.epochs, "epoch"), out=args.outdir) # Resume from a snapshot if args.resume: logging.info("resumed from %s" % args.resume) 
torch_resume(args.resume, trainer) # set intervals eval_interval = (args.eval_interval_epochs, "epoch") save_interval = (args.save_interval_epochs, "epoch") report_interval = (args.report_interval_iters, "iteration") # Evaluate the model with the test dataset for each epoch trainer.extend( CustomEvaluator(model, valid_iter, reporter, device), trigger=eval_interval ) # Save snapshot for each epoch trainer.extend(torch_snapshot(), trigger=save_interval) # Save best models trainer.extend( snapshot_object(model, "model.loss.best"), trigger=training.triggers.MinValueTrigger( "validation/main/loss", trigger=eval_interval ), ) # Save attention figure for each epoch if args.num_save_attention > 0: data = sorted( list(valid_json.items())[: args.num_save_attention], key=lambda x: int(x[1]["input"][0]["shape"][1]), reverse=True, ) if hasattr(model, "module"): att_vis_fn = model.module.calculate_all_attentions plot_class = model.module.attention_plot_class else: att_vis_fn = model.calculate_all_attentions plot_class = model.attention_plot_class att_reporter = plot_class( att_vis_fn, data, args.outdir + "/att_ws", converter=converter, transform=load_cv, device=device, reverse=True, ) trainer.extend(att_reporter, trigger=eval_interval) else: att_reporter = None # Make a plot for training and validation values if hasattr(model, "module"): base_plot_keys = model.module.base_plot_keys else: base_plot_keys = model.base_plot_keys plot_keys = [] for key in base_plot_keys: plot_key = ["main/" + key, "validation/main/" + key] trainer.extend( extensions.PlotReport(plot_key, "epoch", file_name=key + ".png"), trigger=eval_interval, ) plot_keys += plot_key trainer.extend( extensions.PlotReport(plot_keys, "epoch", file_name="all_loss.png"), trigger=eval_interval, ) # Write a log of evaluation statistics for each epoch trainer.extend(extensions.LogReport(trigger=report_interval)) report_keys = ["epoch", "iteration", "elapsed_time"] + plot_keys trainer.extend(extensions.PrintReport(report_keys), trigger=report_interval) trainer.extend(extensions.ProgressBar(), trigger=report_interval) set_early_stop(trainer, args) if args.tensorboard_dir is not None and args.tensorboard_dir != "": from torch.utils.tensorboard import SummaryWriter writer = SummaryWriter(args.tensorboard_dir) trainer.extend(TensorboardLogger(writer, att_reporter), trigger=report_interval) if use_sortagrad: trainer.extend( ShufflingEnabler([train_iter]), trigger=(args.sortagrad if args.sortagrad != -1 else args.epochs, "epoch"), ) # Run the training trainer.run() check_early_stop(trainer, args.epochs)
# NOTE: this snippet begins mid-statement; the CIFAR-10 construction below is an
# assumed reconstruction of the elided head (the root path is a placeholder).
cifar10 = datasets.CIFAR10(
    './data', train=True, download=True,
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]))
print(args.batch_size)
dataloader = torch.utils.data.DataLoader(cifar10, batch_size=args.batch_size,
                                         shuffle=True, num_workers=4, drop_last=True)

net_G = Generator().to(device)
net_D = Discriminator().to(device)

optim_G = Lamb(net_G.parameters(), lr=args.lr, weight_decay=args.wd,
               betas=(.5, 0.999), adam=True)
optim_D = Lamb(net_D.parameters(), lr=args.lr, weight_decay=args.wd,
               betas=(.5, 0.999), adam=True)

train_writer = SummaryWriter(os.path.join(log_dir, 'train'))
valid_writer = SummaryWriter(os.path.join(log_dir, 'valid'))

# Float labels are required by BCEWithLogitsLoss (torch.full would otherwise
# infer an integer dtype for the fill values 1 and 0).
real_label = torch.full((args.batch_size, 1), 1.).to(device)
fake_label = torch.full((args.batch_size, 1), 0.).to(device)
label = torch.cat([real_label, fake_label], dim=0)
criteria = nn.BCEWithLogitsLoss()
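A sketch of how `label` and `criteria` combine in a discriminator step: real and fake logits are concatenated in the same order as the labels. The loop body and `z_dim` (latent size) are assumptions for illustration, not part of the original:

for real, _ in dataloader:
    real = real.to(device)
    noise = torch.randn(args.batch_size, z_dim, device=device)  # z_dim assumed
    fake = net_G(noise)
    logits = torch.cat([net_D(real), net_D(fake.detach())], dim=0)
    loss_D = criteria(logits, label)   # first half real (1.), second half fake (0.)
    optim_D.zero_grad()
    loss_D.backward()
    optim_D.step()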
def main():
    global global_token_count, event_writer, train_step, train_loss, last_log_step, \
        best_val_loss, epoch, model

    if args.local_rank > 0:
        pass  # skip shutdown when rank is explicitly set + not zero rank
    else:
        os.system('shutdown -c')

    if not args.local:
        logger.info(
            f'Distributed initializing process group with {args.dist_backend}, '
            f'{args.dist_url}, {util.get_world_size()}'
        )
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=util.get_world_size())
        assert (util.get_world_size() == dist.get_world_size())
        logger.info(f"Distributed: success ({args.local_rank}/{dist.get_world_size()})")

    model = MemTransformerLM(ntokens, args.n_layer, args.n_head, args.d_model,
                             args.d_head, args.d_inner, args.dropout, args.dropatt,
                             tie_weight=args.tied, d_embed=args.d_embed,
                             div_val=args.div_val, tie_projs=tie_projs,
                             pre_lnorm=args.pre_lnorm, tgt_len=args.tgt_len,
                             ext_len=args.ext_len, mem_len=args.mem_len,
                             cutoffs=cutoffs, same_length=args.same_length,
                             attn_type=args.attn_type, clamp_len=args.clamp_len,
                             sample_softmax=args.sample_softmax)

    # log model info
    n_all_param = sum([p.nelement() for p in model.parameters()])
    log_tb('sizes/params', n_all_param)
    n_nonemb_param = sum([p.nelement() for p in model.layers.parameters()])
    log_tb('sizes/non_emb_params', n_nonemb_param)
    logger.info('params %s non_emb_params %s', n_all_param, n_nonemb_param)

    # optimizer
    if args.optim.lower() == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.mom)
    elif args.optim.lower() == 'lamb':
        optimizer = Lamb(model.parameters(), lr=args.lr, weight_decay=args.wd)
    else:
        assert args.optim.lower() == 'adam'
        optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd)

    # scheduler
    if args.scheduler == 'cosine':
        # Divide by 1e6 for numerical stability.
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         args.max_tokens // 1e6,
                                                         eta_min=args.eta_min)
    elif args.scheduler == 'finder':
        scheduler = LRFinder(optimizer, args.max_tokens, init_value=args.lr / 1e3)
    elif args.scheduler == 'constant':
        pass

    model.apply(weights_init)
    # ensure embedding init is not overridden by out_layer in case of weight sharing
    model.word_emb.apply(weights_init)

    if args.checkpoint:
        if global_rank == 0:
            util.restore_from_checkpoint(model=model, checkpoint_fn=args.checkpoint)

    model = model.to(device)
    if args.fp16:
        model = FP16_Module(model)
        optimizer = FP16_Optimizer(optimizer,
                                   static_loss_scale=args.static_loss_scale,
                                   dynamic_loss_scale=args.dynamic_loss_scale,
                                   dynamic_loss_args={'init_scale': 2 ** 16},
                                   verbose=False)

    if args.local:
        model = nn.DataParallel(model, dim=1)
    else:
        # Uncomment find_unused_parameters and upgrade to torch 1.1 for adaptive embedding.
        model = DistributedDataParallel(
            model, device_ids=[args.local_rank],
            output_device=args.local_rank)  # , find_unused_parameters=True)

    if global_rank == 0:
        event_writer = SummaryWriter(args.logdir)
        event_writer.add_text('args', str(args))

    # test checkpoint writing
    if args.checkpoint_each_epoch:
        logger.info(f'Saving checkpoint for epoch {epoch}')
        util.dist_save_checkpoint(model, optimizer, args.logdir, suffix=f'{0}')

    # Loop over epochs.
    train_step = 0
    train_loss = 0
    last_log_step = 0
    best_val_loss = None

    va_iter, te_iter = [
        corpus.get_dist_iterator(split, global_rank, max_rank,
                                 args.batch_size * 2, args.tgt_len,
                                 device=device, ext_len=args.ext_len)
        for split in ('valid', 'test')
    ]

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        for epoch in itertools.count(start=1):
            train(va_iter, optimizer, scheduler)
    except KeyboardInterrupt:
        logger.info('-' * 100)
        logger.info('Exiting from training early')
    except StopIteration:
        pass

    # Eval one more time.
    evaluate_and_log(optimizer, va_iter, 'val', train_step=-1)

    # Load the best saved model.
    logger.info("Loading best checkpoint")
    model_file = os.path.join(args.logdir, 'model-best.pt')
    if os.path.exists(model_file):
        with open(model_file, 'rb') as model_f:
            with timeit('load'):
                if args.local:
                    model = torch.load(model_f)
                else:
                    model = torch.load(
                        model_f,
                        map_location=lambda storage, loc: storage.cuda(args.local_rank))
                    model = DistributedDataParallel(
                        model, device_ids=[args.local_rank],
                        output_device=args.local_rank)
    else:
        logger.warning('no model file, using current model for loss')

    # Run on test data.
    evaluate_and_log(optimizer, te_iter, 'test', -1)
# NOTE: this snippet begins mid-list; the head of the parameter grouping below
# is an assumed reconstruction following the standard no-decay pattern used in
# the other snippets.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_group_parameters = [{
    "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    "weight_decay": 0.01
}, {
    "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
    "weight_decay": 0.0
}]

if args["adam"]:
    optimizer = BertAdam(optimizer_group_parameters,
                         lr=args["learning_rate"],
                         warmup=args["warmup_proportion"],
                         t_total=num_train_steps)
else:
    # Assumed fix: the flattened source passed args["warmup_proportion"] as
    # weight_decay here, which appears to be a copy-paste slip.
    optimizer = Lamb(optimizer_group_parameters,
                     lr=args["learning_rate"],
                     weight_decay=args["weight_decay"])

global_step = 0
nb_tr_steps = 0
tr_loss = 0

if args["do_train"]:
    train_features = convert_examples_to_features(train_examples, label_list,
                                                  args["max_seq_length"], tokenizer)
    logger.info("******** Running training ********")
    logger.info("  Number of examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args["train_batch_size"])
    logger.info("  Num steps = %d", num_train_steps)
def __init__(self, opt):
    """Initialize the CycleGAN class.

    Parameters:
        opt (Option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions
    """
    BaseModel.__init__(self, opt)
    # specify the training losses you want to print out. The training/test scripts will call <BaseModel.get_current_losses>
    self.loss_names = [
        'D_A', 'G_A', 'cycle_A', 'idt_A', 'D_B', 'G_B', 'cycle_B', 'idt_B'
    ]
    # specify the images you want to save/display. The training/test scripts will call <BaseModel.get_current_visuals>
    visual_names_A = ['real_A', 'fake_B', 'rec_A']
    visual_names_B = ['real_B', 'fake_A', 'rec_B']
    if self.isTrain and self.opt.lambda_identity > 0.0:
        # if identity loss is used, we also visualize idt_A=G_A(B) and idt_B=G_B(A)
        visual_names_A.append('idt_B')
        visual_names_B.append('idt_A')

    self.visual_names = visual_names_A + visual_names_B  # combine visualizations for A and B
    # specify the models you want to save to the disk. The training/test scripts will call <BaseModel.save_networks> and <BaseModel.load_networks>.
    if self.isTrain:
        self.model_names = ['G_A', 'G_B', 'D_A', 'D_B']
    else:  # during test time, only load Gs
        self.model_names = ['G_A', 'G_B']

    # define networks (both Generators and discriminators)
    # The naming is different from those used in the paper.
    # Code (vs. paper): G_A (G), G_B (F), D_A (D_Y), D_B (D_X)
    self.netG_A = networks.define_G(opt.input_nc, opt.output_nc, opt.ngf,
                                    opt.netG, opt.norm, not opt.no_dropout,
                                    opt.init_type, opt.init_gain, self.gpu_ids)
    self.netG_B = networks.define_G(opt.output_nc, opt.input_nc, opt.ngf,
                                    opt.netG, opt.norm, not opt.no_dropout,
                                    opt.init_type, opt.init_gain, self.gpu_ids)

    if self.isTrain:  # define discriminators
        self.netD_A = networks.define_D(opt.output_nc, opt.ndf, opt.netD,
                                        opt.n_layers_D, opt.norm, opt.init_type,
                                        opt.init_gain, self.gpu_ids)
        self.netD_B = networks.define_D(opt.input_nc, opt.ndf, opt.netD,
                                        opt.n_layers_D, opt.norm, opt.init_type,
                                        opt.init_gain, self.gpu_ids)

    if self.isTrain:
        if opt.lambda_identity > 0.0:
            # only works when input and output images have the same number of channels
            assert (opt.input_nc == opt.output_nc)
        # create image buffers to store previously generated images
        self.fake_A_pool = ImagePool(opt.pool_size)
        self.fake_B_pool = ImagePool(opt.pool_size)
        # define loss functions
        self.criterionGAN = networks.GANLoss(opt.gan_mode).to(self.device)  # define GAN loss.
        self.criterionCycle = torch.nn.L1Loss()
        self.criterionIdt = torch.nn.L1Loss()
        # initialize optimizers; schedulers will be automatically created by function <BaseModel.setup>.
        # self.optimizer_G = torch.optim.Adam(itertools.chain(self.netG_A.parameters(), self.netG_B.parameters()), lr=opt.lr, betas=(opt.beta1, 0.999))
        # self.optimizer_D = torch.optim.Adam(itertools.chain(self.netD_A.parameters(), self.netD_B.parameters()), lr=opt.lr, betas=(opt.beta1, 0.999))
        self.optimizer_G = Lamb(itertools.chain(self.netG_A.parameters(),
                                                self.netG_B.parameters()),
                                lr=opt.lr, weight_decay=opt.wd,
                                adam=(opt.optimizer == 'adam'),
                                betas=(opt.beta1, 0.999))
        self.optimizer_D = Lamb(itertools.chain(self.netD_A.parameters(),
                                                self.netD_B.parameters()),
                                lr=opt.lr, weight_decay=opt.wd,
                                adam=(opt.optimizer == 'adam'),
                                betas=(opt.beta1, 0.999))
        self.optimizers.append(self.optimizer_G)
        self.optimizers.append(self.optimizer_D)
def run_finetuning(args):
    torch.manual_seed(args.seed)
    device = torch.device(
        'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu')

    # Get text columns
    t_columns = args.text_columns.split(',')
    num_texts = len(t_columns)
    if num_texts == 1:
        t_columns = t_columns[0]

    # Get label columns
    l_columns = args.label_columns.split(',')
    num_labels = len(l_columns)
    if num_labels == 1:
        l_columns = l_columns[0]

    if args.fp16 and not APEX_AVAILABLE:
        print("FP16 toggle is on but Apex is not available. Using FP32 training.")

    if args.do_train:
        # Configure tokenizer
        tokenizer = AutoTokenizer.from_pretrained(args.pretrained)
        if args.add_token != '':
            add_token = {'additional_special_tokens': args.add_token.split(',')}
            added = tokenizer.add_special_tokens(add_token)

        print('\n' + '=' * 50, '\nCONFIGURE FINETUNING SETUP', '\n' + '=' * 50)
        if args.add_token != '':
            print("Added {} special tokens:".format(added), args.add_token)

        # Produce hash code for cache
        f_string = (args.train_data + args.valid_data + str(args.msl) +
                    str(args.seed) + args.pretrained + str(args.data_pct))
        hashed = 'cache_' + hashlib.md5(f_string.encode()).hexdigest() + '.pt'

        # Produce the dataset if cache doesn't exist
        if hashed not in os.listdir() or args.retokenize_data:
            print("Producing dataset cache. This will take a while.")
            s = time.time()
            df = pd.read_csv(args.train_data, lineterminator='\n').sample(
                frac=args.data_pct, random_state=args.seed)
            text, labels = df[t_columns].values, df[l_columns].values
            train_dataset = process_data(text, labels, tokenizer, msl=args.msl)
            df = pd.read_csv(args.valid_data, lineterminator='\n')
            text, labels = df[t_columns].values, df[l_columns].values
            valid_dataset = process_data(text, labels, tokenizer, msl=args.msl)
            if args.save_cache:
                print('Saving data cache')
                with open(hashed, 'wb') as f:
                    torch.save([train_dataset, valid_dataset], f)
            print("Preprocessing finished. Time elapsed: {:.2f}s".format(
                time.time() - s))
        # Load the dataset if the cache exists
        else:
            print('Cache found. Loading training and validation data.')
            with open(hashed, 'rb') as f:
                train_dataset, valid_dataset = torch.load(f)

        # Produce dataloaders
        train_sampler = data.RandomSampler(train_dataset)
        train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size,
                                       sampler=train_sampler)
        valid_loader = data.DataLoader(valid_dataset, batch_size=args.batch_size,
                                       shuffle=False)

        # Configure model
        config = AutoConfig.from_pretrained(
            args.pretrained, num_labels=2 if num_labels == 1 else num_labels)
        if args.random_init:
            print("Initializing new randomly-initialized model from configuration")
            model = AutoModelForSequenceClassification.from_config(config)
        else:
            print("Loading from pretrained checkpoint")
            model = AutoModelForSequenceClassification.from_pretrained(
                args.pretrained, config=config)
        _ = model.resize_token_embeddings(len(tokenizer))
        model = model.to(device)
        print("Model has {:,} trainable parameters".format(
            sum(p.numel() for p in model.parameters() if p.requires_grad)))

        # Configure loss function
        criterion = (torch.nn.CrossEntropyLoss() if num_labels == 1
                     else torch.nn.BCEWithLogitsLoss())

        # Configure optimizer
        if args.optimizer == 'adam':
            no_decay = ["bias", "LayerNorm.weight"]
            optimizer_grouped_parameters = [{
                "params": [p for n, p in model.named_parameters()
                           if not any(nd in n for nd in no_decay)],
                "weight_decay": args.weight_decay
            }, {
                "params": [p for n, p in model.named_parameters()
                           if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0
            }]
            optimizer = AdamW(optimizer_grouped_parameters,
                              lr=args.learning_rate, eps=args.adam_epsilon)
            optimizer.zero_grad()
        elif args.optimizer == 'lamb':
            from pytorch_lamb import Lamb
            optimizer = Lamb(model.parameters(), lr=args.learning_rate,
                             weight_decay=args.weight_decay,
                             betas=(args.adam_b1, args.adam_b2))

        # Configure FP16
        if args.fp16 and APEX_AVAILABLE:
            print("Using FP16 training.")
            model, optimizer = amp.initialize(model, optimizer,
                                              opt_level=args.opt_level)

        # Configure scheduler
        if args.use_scheduler:
            steps = len(train_loader) * args.epochs // args.accumulation
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=int(steps * args.warmup_pct),
                num_training_steps=steps)
        else:
            scheduler = None

        print("Using learning rate {:.4E} and weight decay {:.4E}".format(
            args.learning_rate, args.weight_decay), end='')
        if args.use_scheduler:
            print(" with scheduler using warmup pct {}".format(args.warmup_pct))
        else:
            print("")

        # Training proper
        print('\n' + '=' * 50, '\nTRAINING', '\n' + '=' * 50)
        print("Training batches: {} | Validation batches: {}".format(
            len(train_loader), len(valid_loader)))

        for e in range(1, args.epochs + 1):
            train_loss, train_acc = train(model, criterion, optimizer,
                                          train_loader, scheduler=scheduler,
                                          accumulation=args.accumulation,
                                          device=device, fp16=args.fp16)
            valid_loss, valid_acc = evaluate(model, criterion, valid_loader,
                                             device=device)
            print("Epoch {:3} | Train Loss {:.4f} | Train Acc {:.4f} | "
                  "Valid Loss {:.4f} | Valid Acc {:.4f}".format(
                      e, train_loss, train_acc, valid_loss, valid_acc))

        # Save the model
        model.save_pretrained(args.checkpoint)
        tokenizer.save_pretrained(args.checkpoint)
        # with open(args.checkpoint, 'wb') as f:
        #     torch.save(model.state_dict(), f)

    if args.do_eval:
        print('\n' + '=' * 50, '\nBEGIN EVALUATION PROPER', '\n' + '=' * 50)

        # Load saved tokenizer
        tokenizer = AutoTokenizer.from_pretrained(args.checkpoint)

        # Produce hash code for test cache
        f_string = args.test_data + str(args.msl) + str(args.seed) + args.pretrained
        hashed = 'cache_' + hashlib.md5(f_string.encode()).hexdigest() + '.pt'

        # Produce the dataset if cache doesn't exist
        if hashed not in os.listdir() or args.retokenize_data:
            print("Producing test data cache. This will take a while.")
            s = time.time()
            df = pd.read_csv(args.test_data, lineterminator='\n')
            text, labels = df[t_columns].values, df[l_columns].values
            test_dataset = process_data(text, labels, tokenizer, msl=args.msl)
            if args.save_cache:
                print('Saving data cache')
                with open(hashed, 'wb') as f:
                    torch.save(test_dataset, f)
            print("Preprocessing finished. Time elapsed: {:.2f}s".format(
                time.time() - s))
        # Load the dataset if the cache exists
        else:
            print('Cache found. Loading test data.')
            with open(hashed, 'rb') as f:
                test_dataset = torch.load(f)

        # Dataloaders
        test_loader = torch.utils.data.DataLoader(test_dataset,
                                                  batch_size=args.batch_size,
                                                  shuffle=False)

        # Produce the model
        print("Loading finetuned checkpoint")
        model = AutoModelForSequenceClassification.from_pretrained(args.checkpoint)
        model = model.to(device)
        criterion = (torch.nn.CrossEntropyLoss() if num_labels == 1
                     else torch.nn.BCEWithLogitsLoss())

        # Testing proper
        print('\n' + '=' * 50, '\nTESTING', '\n' + '=' * 50)
        test_loss, test_acc = evaluate(model, criterion, test_loader, device=device)
        print("Test Loss {:.4f} | Test Accuracy {:.4f}".format(test_loss, test_acc))

    # Logging
    if not args.do_train:
        train_loss, train_acc, valid_loss, valid_acc = None, None, None, None
    if not args.do_eval:
        test_loss, test_acc = None, None

    return train_loss, train_acc, valid_loss, valid_acc, test_loss, test_acc
def train(args, train_dataset, model, tokenizer, pad_token_label_id):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = (RandomSampler(train_dataset) if args.local_rank == -1
                     else DistributedSampler(train_dataset))
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = (len(train_dataloader) // args.gradient_accumulation_steps
                   * args.num_train_epochs)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    if args.optimizer.lower() == "adamw":
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate,
                          eps=args.adam_epsilon)
    elif args.optimizer.lower() == "lamb":
        optimizer = Lamb(optimizer_grouped_parameters, lr=args.learning_rate,
                         eps=args.adam_epsilon)
    else:
        raise Exception("Invalid optimizer specified")
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    # if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
    #     os.path.join(args.model_name_or_path, "scheduler.pt")
    # ):
    #     # Load in optimizer and scheduler states
    #     optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
    #     scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True,
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.encoder_model_name_or_path):
        # set global_step to global_step of last saved checkpoint from model path
        if args.encoder_model_name_or_path.split("-")[-1].split("/")[0].isdigit():
            global_step = int(
                args.encoder_model_name_or_path.split("-")[-1].split("/")[0])
        else:
            global_step = 0
        epochs_trained = global_step // (len(train_dataloader) //
                                         args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (
            len(train_dataloader) // args.gradient_accumulation_steps)

        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
        logger.info("  Will skip the first %d steps in the first epoch",
                    steps_trained_in_current_epoch)

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(
        epochs_trained,
        int(args.num_train_epochs),
        desc="Epoch",
        disable=args.local_rank not in [-1, 0],
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            input_ids, output_ids, input_mask, output_mask, _, decoder_ids = batch

            # add other inputs here, including kwargs
            inputs = {"input_ids": input_ids, "attention_mask": input_mask}

            # The output tuple structure depends on the model used and the arguments invoked
            # For BERT-type models, this is
            # decoder_predictions, encoded_embeddings, encoded_attention_mask = model(**inputs)
            # For GPT2-type models, this at least starts with the decoder predictions
            # See the EncoderDecoderModel class for more details
            output = model(**inputs)
            decoder_predictions = output[0]

            vocab_size = decoder_predictions.shape[-1]
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(
                decoder_predictions.view(-1, vocab_size),
                output_ids.view(-1),
            )

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                                   args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if (args.local_rank in [-1, 0] and args.logging_steps > 0
                        and global_step % args.logging_steps == 0):
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results, _ = evaluate(args, model, tokenizer,
                                              pad_token_label_id, mode="dev")
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar(
                        "loss",
                        (tr_loss - logging_loss) / args.logging_steps,
                        global_step,
                    )
                    logging_loss = tr_loss

                if (args.local_rank in [-1, 0] and args.save_steps > 0
                        and global_step % args.save_steps == 0):
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir,
                                              "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
if distributed:
    torch.distributed.init_process_group(backend='nccl')
    rank = torch.distributed.get_rank()
    torch.cuda.set_device(rank)

dataset = enwik8()

if not fresh:
    with open('model.pt', 'rb') as f:
        model = torch.load(f)
else:
    model = SHARNN(n_token=dataset.n_token, embed_dim=1024, hidden_dim=4096,
                   ff_dim=2048, n_layers=4, heads=1, max_len=5000,
                   dropout=0.1, tied=True)
model.to(device)

if distributed:
    model = torch.nn.parallel.DistributedDataParallel(
        model, device_ids=[rank], output_device=rank, dim=1,
        find_unused_parameters=True)

# optim = torch.optim.Adam(model.parameters(), lr=0.002)
from pytorch_lamb import Lamb
optim = Lamb(model.parameters(), lr=0.002, min_trust=0.25)

crit = nn.CrossEntropyLoss().to(device)

# sched = torch.optim.lr_scheduler.CosineAnnealingLR(optim, epochs)
sched = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=2)
scaler = torch.cuda.amp.GradScaler()

if True:
    train(model, crit, optim, sched, dataset, epochs)

test_loss = evaluate(model, crit, dataset.test_data)
print(f"Test | loss {test_loss:.3f} | ppl {math.exp(test_loss):.3f} | "
      f"bpc {test_loss / math.log(2):.3f}")
exit()
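Unlike the step-based schedulers in the other snippets, ReduceLROnPlateau must be fed a monitored metric; presumably `train` does something along these lines once per epoch (a sketch with an assumed `valid_data` attribute):

val_loss = evaluate(model, crit, dataset.valid_data)  # valid_data assumed
sched.step(val_loss)  # lowers lr after `patience` epochs without improvement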
# At any point you can hit Ctrl + C to break out of training early.
try:
    optimizer = None
    # Ensure the optimizer is optimizing params, which includes both the model's
    # weights as well as the criterion's weight (i.e. Adaptive Softmax)
    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(params, lr=args.lr, weight_decay=args.wdecay)
    if args.optimizer == 'adagrad':
        optimizer = torch.optim.Adagrad(params, lr=args.lr, weight_decay=args.wdecay)
    if args.optimizer == 'adam':
        optimizer = torch.optim.Adam(params, lr=args.lr, weight_decay=args.wdecay)
    if args.optimizer == 'adamw':
        optimizer = torch.optim.AdamW(params, lr=args.lr, weight_decay=args.wdecay)
    if args.optimizer == 'lamb':
        from pytorch_lamb import Lamb
        optimizer = Lamb(params, lr=args.lr, weight_decay=args.wdecay, min_trust=0.25)
        # optimizer = Lamb(params, lr=args.lr, weight_decay=args.wdecay, min_trust=0.1)
        # optimizer = Lamb(params, lr=args.lr, weight_decay=args.wdecay, min_trust=0, random_min_trust=0.2, random_trust_dice=10)
        # optimizer = Lamb(params, lr=args.lr, weight_decay=args.wdecay, min_trust=0.2, random_min_trust=0.5, random_trust_dice=4)

    from lookahead import Lookahead
    if False:  # Lookahead wrapper, currently disabled
        k, alpha = 5, 0.8
        print('Lookahead - k {} and alpha {}'.format(k, alpha))
        optimizer = Lookahead(base_optimizer=optimizer, k=k, alpha=alpha)

    from apex import amp
    model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
    # model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    for epoch in range(1, args.epochs + 1):
        epoch_start_time = time.time()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_dir', type=str, help='Directory of training samples')
    parser.add_argument('--valid_dir', type=str, help='Directory of validation samples')
    parser.add_argument('--test_dir', type=str, help='Directory of test samples')
    parser.add_argument('--src_vocab', type=str, help='SentencePiece vocabulary file for source sentence')
    parser.add_argument('--trg_vocab', type=str, help='SentencePiece vocabulary file for target sentence')
    parser.add_argument('--src_msl', type=int, default=100, help='Maximum sequence length for source sentence')
    parser.add_argument('--trg_msl', type=int, default=100, help='Maximum sequence length for target sentence')
    parser.add_argument('--batch_size', type=int, default=8, help='Batch size')
    parser.add_argument('--num_workers', type=int, default=1, help='Number of parallel workers for dataloading')
    parser.add_argument('--save_dir', type=str, help='Directory to save checkpoints and load checkpoints from')
    parser.add_argument('--epochs', type=int, default=10, help='Number of epochs to train')
    parser.add_argument('--tie_weights', action='store_true', help='Tie weights of encoder/decoder embeddings and projection layer')
    parser.add_argument('--hidden_dim', type=int, default=256, help='Hidden dimensions of the transformer layers')
    parser.add_argument('--n_layers', type=int, default=3, help='Number of transformer blocks in the encoder and decoder')
    parser.add_argument('--n_heads', type=int, default=8, help='Number of attention heads')
    parser.add_argument('--pf_dim', type=int, default=512, help='Positionwise feedforward projection dimension')
    parser.add_argument('--dropout', type=float, default=0.1, help='Dropout probability')
    parser.add_argument('--clip', type=float, default=1.0, help='Gradient clipping')
    parser.add_argument('--criterion', type=str, default='cross_entropy', choices=['cross_entropy', 'label_smoothing'], help='Criterion to use')
    parser.add_argument('--smoothing', type=float, default=0.0, help='Label smoothing factor')
    parser.add_argument('--optimizer', type=str, default='adam', choices=['adam', 'adamw', 'lamb'], help='Optimizer to use')
    parser.add_argument('--learning_rate', type=float, default=3e-4, help='Learning rate')
    parser.add_argument('--weight_decay', type=float, default=0.0, help='Weight decay for non-LayerNorm and non-bias parameters')
    parser.add_argument('--adam_epsilon', type=float, default=1e-9, help='Epsilon value for Adam')
    parser.add_argument('--adam_b1', type=float, default=0.9, help='Beta1 for LAMB')
    parser.add_argument('--adam_b2', type=float, default=0.99, help='Beta2 for LAMB')
    parser.add_argument('--scheduler', type=str, default=None, choices=['cosine', 'linear', 'noam', None], help='Scheduler to use')
    parser.add_argument('--warmup_pct', type=float, default=0.1, help='Percentage of steps to warmup for linear scheduler')
    parser.add_argument('--warmup_steps', type=int, default=4000, help='Number of warmup steps for noam scheduling')
    parser.add_argument('--do_train', action='store_true', help='Train a model')
    parser.add_argument('--do_test', action='store_true', help='Evaluate a model')
    parser.add_argument('--resume_training', action='store_true', help='Toggle to resume from checkpoint in --save_dir')
    parser.add_argument('--no_cuda', action='store_true', help='Do not use GPU')
    parser.add_argument('--fp16', action='store_true', help='Use FP16 training via APEX')
    parser.add_argument('--opt_level', type=str, default='O1', choices=['O1', 'O2'], help='Optimization level for FP16 training')
    parser.add_argument('--save_every', type=int, default=1, help='Save checkpoint every --save_every epoch')
    parser.add_argument('--pad_token', type=str, default='<pad>', help='Override default padding token')
    parser.add_argument('--use_swa', action='store_true', help='Use stochastic weight averaging')
    parser.add_argument('--swa_pct', type=float, help='Last percentage of total training steps to average')
    parser.add_argument('--swa_times', type=int, help='Number of times to average over swa_pct')
    parser.add_argument('--seed', type=int, default=1111, help='Random seed')
    args = parser.parse_args()
    print(args)

    # Set seeds
    torch.manual_seed(args.seed)
    device = torch.device('cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu')

    if args.do_train:
        # Produce dataloaders
        print("Producing dataloaders.")
        train_dataset = TextDataset(args.train_dir, args.src_vocab, args.trg_vocab,
                                    src_msl=args.src_msl, trg_msl=args.trg_msl)
        train_sampler = torch.utils.data.RandomSampler(train_dataset)
        train_loader = torch.utils.data.DataLoader(train_dataset, sampler=train_sampler,
                                                   batch_size=args.batch_size,
                                                   collate_fn=collate_fn,
                                                   num_workers=args.num_workers)
        valid_dataset = TextDataset(args.valid_dir, args.src_vocab, args.trg_vocab,
                                    src_msl=args.src_msl, trg_msl=args.trg_msl)
        valid_loader = torch.utils.data.DataLoader(valid_dataset, shuffle=False,
                                                   batch_size=args.batch_size,
                                                   collate_fn=collate_fn,
                                                   num_workers=args.num_workers)
        print("Training batches: {}\nValidation batches: {}".format(
            len(train_loader), len(valid_loader)))

        # Produce model and criterion
        encoder = Encoder(vocab_sz=train_dataset.src_vocab_sz, hidden_dim=args.hidden_dim,
                          n_layers=args.n_layers, n_heads=args.n_heads,
                          pf_dim=args.pf_dim, dropout=args.dropout,
                          msl=args.src_msl, fp16=args.fp16)
        decoder = Decoder(vocab_sz=train_dataset.trg_vocab_sz, hidden_dim=args.hidden_dim,
                          n_layers=args.n_layers, n_heads=args.n_heads,
                          pf_dim=args.pf_dim, dropout=args.dropout,
                          msl=args.trg_msl, fp16=args.fp16)
        model = Seq2Seq(encoder, decoder, train_dataset.src_word2idx[args.pad_token],
                        train_dataset.trg_word2idx[args.pad_token],
                        tie_weights=args.tie_weights).to(device)

        # Configure SWA
        if args.use_swa:
            swa_model = AveragedModel(model)
        else:
            swa_model = None

        # Produce criterion
        if args.criterion == 'cross_entropy':
            criterion = nn.CrossEntropyLoss(ignore_index=train_dataset.src_word2idx[args.pad_token])
        elif args.criterion == 'label_smoothing':
            criterion = LabelSmoothingLoss(epsilon=args.smoothing)

        # Produce optimizer
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {"params": [p for n, p in model.named_parameters()
                        if not any(nd in n for nd in no_decay)],
             "weight_decay": args.weight_decay},
            {"params": [p for n, p in model.named_parameters()
                        if any(nd in n for nd in no_decay)],
             "weight_decay": 0.0}]
        if args.optimizer == 'adamw':
            try:
                from transformers import AdamW
                optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate,
                                  eps=args.adam_epsilon, betas=(args.adam_b1, args.adam_b2))
            except ModuleNotFoundError:
                print("Transformers module not found for AdamW. Using generic Adam instead.")
                optimizer = optim.Adam(optimizer_grouped_parameters, lr=args.learning_rate,
                                       eps=args.adam_epsilon, betas=(args.adam_b1, args.adam_b2))
        elif args.optimizer == 'lamb':
            try:
                from pytorch_lamb import Lamb
                # NOTE: this branch passes model.parameters(), so the no-decay
                # grouping above is not applied under LAMB.
                optimizer = Lamb(model.parameters(), lr=args.learning_rate,
                                 weight_decay=args.weight_decay,
                                 betas=(args.adam_b1, args.adam_b2))
            except ModuleNotFoundError:
                print("LAMB implementation not found. Using generic Adam instead.")
                optimizer = optim.Adam(optimizer_grouped_parameters, lr=args.learning_rate,
                                       eps=args.adam_epsilon, betas=(args.adam_b1, args.adam_b2))
        elif args.optimizer == 'adam':
            optimizer = optim.Adam(optimizer_grouped_parameters, lr=args.learning_rate,
                                   eps=args.adam_epsilon, betas=(args.adam_b1, args.adam_b2))

        # Configure FP16
        if args.fp16 and APEX_AVAILABLE:
            model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level)

        # Produce the scheduler
        if args.scheduler == 'cosine':
            scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                             len(train_loader) * args.epochs)
        elif args.scheduler == 'linear':
            try:
                from transformers import get_linear_schedule_with_warmup
                steps = args.epochs * len(train_loader)
                scheduler = get_linear_schedule_with_warmup(
                    optimizer, num_warmup_steps=int(steps * args.warmup_pct),
                    num_training_steps=steps)
            except ModuleNotFoundError:
                print('Transformers module not found for Linear Schedule. Not using a scheduler.')
                scheduler = None
        elif args.scheduler == 'noam':
            scheduler = NoamLR(optimizer, warmup_steps=args.warmup_steps)
        else:
            scheduler = None

        print("\nUsing {} optimizer with {} scheduling. Optimizing via {}.".format(
            str(type(optimizer)), str(type(scheduler)), str(type(criterion))))
        print("Model has {:,} trainable parameters.\n".format(count_parameters(model)))

        # Configure states if resuming from checkpoint
        if args.resume_training:
            print("Loading from checkpoint...", end='')
            with open(args.save_dir + '/model.bin', 'rb') as f:
                model.load_state_dict(torch.load(f))
            print('Model loaded...', end='')
            if args.use_swa:
                with open(args.save_dir + '/swa_model.bin', 'rb') as f:
                    swa_model.load_state_dict(torch.load(f))
                print('SWA Model loaded...', end='')
            with open(args.save_dir + '/training.bin', 'rb') as f:
                training_state = torch.load(f)
            optimizer.load_state_dict(training_state['opt_state'])
            e = training_state['e'] + 1  # Start on the next epoch
            print('Optimizer loaded...', end='')
            if training_state['scheduler'] is not None:
                scheduler.load_state_dict(training_state['scheduler'])
                print('Scheduler loaded...', end='')
            else:
                print('No scheduler found...')
            global_steps = len(train_loader) * (e - 1)
            print("Done!\nLoaded checkpoint from epoch {} | Global step {}!".format(
                training_state['e'], global_steps))
        # Else, begin from epoch 1
        else:
            print("Beginning training from epoch 1.")
            e = 1
            global_steps = 0

        # Print training setup
        total_steps = len(train_loader) * args.epochs
        print('Total number of steps: {}'.format(total_steps))

        # Configure SWA points
        if args.use_swa:
            swa_every = sorted(list(set(
                [round(total_steps * (1 - args.swa_pct * i)) for i in range(args.swa_times)])))
            print('SWA on steps: {}\n'.format(swa_every))
        else:
            swa_every = None
        print('\n')

        # Train model
        while e < args.epochs + 1:
            # Train one epoch
            train_loss, global_steps = train(model, criterion, optimizer, train_loader,
                                             global_steps, device=device, clip=args.clip,
                                             scheduler=scheduler, fp16=args.fp16,
                                             swa=args.use_swa, swa_every=swa_every,
                                             swa_model=swa_model)
            valid_loss = evaluate(model, criterion, valid_loader, device=device)
            print("Epoch {:3} | Train Loss {:.4f} | Train Ppl {:.4f} | "
                  "Valid Loss {:.4f} | Valid Ppl {:.4f}".format(
                      e, train_loss, np.exp(train_loss), valid_loss, np.exp(valid_loss)))

            # Save the checkpoint
            if e % args.save_every == 0 or e == args.epochs:
                print('Saving checkpoint for epoch {}...'.format(e), end='')
                save_checkpoint(model, args, optimizer=optimizer, e=e,
                                scheduler=scheduler, save_state=True,
                                swa_model=swa_model)
                print('Done!')

            # Update epoch number
            e += 1

        # Evaluate again and save if we're using SWA
        if args.use_swa:
            print('Evaluating on final averaged model.')
            valid_loss = evaluate(swa_model, criterion, valid_loader, device=device)
            print("Valid Loss {:.4f} | Valid Ppl {:.4f}".format(valid_loss, np.exp(valid_loss)))
            print('Saving checkpoint for averaged model.')
            save_checkpoint(model, args, optimizer=optimizer, e=e, scheduler=scheduler,
                            save_state=True, swa_model=swa_model)
            print('Done!')

        print("Training done!\n")

    if args.do_test:
        # Produce dataloaders
        print("Producing test loaders.")
        test_dataset = TextDataset(args.test_dir, args.src_vocab, args.trg_vocab,
                                   src_msl=args.src_msl, trg_msl=args.trg_msl)
        test_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False,
                                                  batch_size=args.batch_size,
                                                  collate_fn=collate_fn,
                                                  num_workers=args.num_workers)
        print("Number of testing batches: {}".format(len(test_loader)))

        # Produce setup
        print("Loading model and saved settings.")
        with open(args.save_dir + '/settings.bin', 'rb') as f:
            hd, nl, nh, pf, dp, smsl, tmsl, tw, usw, cri = torch.load(f)
        encoder = Encoder(vocab_sz=test_dataset.src_vocab_sz, hidden_dim=hd,
                          n_layers=nl, n_heads=nh, pf_dim=pf, dropout=dp,
                          msl=smsl, fp16=args.fp16)
        decoder = Decoder(vocab_sz=test_dataset.trg_vocab_sz, hidden_dim=hd,
                          n_layers=nl, n_heads=nh, pf_dim=pf, dropout=dp,
                          msl=tmsl, fp16=args.fp16)
        model = Seq2Seq(encoder, decoder, test_dataset.src_word2idx[args.pad_token],
                        test_dataset.trg_word2idx[args.pad_token], tie_weights=tw)
        if cri == 'cross_entropy':
            criterion = nn.CrossEntropyLoss(ignore_index=test_dataset.trg_word2idx[args.pad_token])
        elif cri == 'label_smoothing':
            criterion = LabelSmoothingLoss(epsilon=args.smoothing)

        # Load the checkpoint
        if usw:
            swa_model = AveragedModel(model)
            with open(args.save_dir + '/swa_model.bin', 'rb') as f:
                swa_model.load_state_dict(torch.load(f))
            swa_model = swa_model.to(device)
        with open(args.save_dir + '/model.bin', 'rb') as f:
            model.load_state_dict(torch.load(f))
        model = model.to(device)

        print("\nBegin testing.")
        test_loss = evaluate(model, criterion, test_loader, device=device)
        print("Test Loss {:.4f} | Test Ppl {:.4f}".format(test_loss, np.exp(test_loss)))
        if usw:
            print('Testing SWA model.')
            test_loss = evaluate(swa_model, criterion, test_loader, device=device)
            print("Test Loss {:.4f} | Test Ppl {:.4f}".format(test_loss, np.exp(test_loss)))
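The SWA pieces above follow torch.optim.swa_utils. A minimal sketch of what `train` presumably does at each step listed in `swa_every` (the surrounding names are taken from the calling code; the loop placement is an assumption):

if swa and global_steps in swa_every:
    swa_model.update_parameters(model)  # fold current weights into the running average
# For models with batch norm, statistics would afterwards be refreshed with:
# torch.optim.swa_utils.update_bn(train_loader, swa_model)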