def main():
    # get args
    args = get_args()

    # set up gpus
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    assert torch.cuda.is_available()

    # set up writer, logger, and save directory for models
    save_root = os.path.join(
        'checkpoints', 'dverge', 'seed_{:d}'.format(args.seed),
        '{:d}_{:s}{:d}_eps_{:.2f}'.format(
            args.model_num, args.arch, args.depth, args.distill_eps))
    if args.distill_fixed_layer:
        save_root += '_fixed_layer_{:d}'.format(args.distill_layer)
    if args.plus_adv:
        save_root += '_plus_adv_coeff_{:.1f}'.format(args.dverge_coeff)
    if args.start_from == 'scratch':
        save_root += '_start_from_scratch'
    if not os.path.exists(save_root):
        os.makedirs(save_root)
    else:
        print('*********************************')
        print('* The checkpoint already exists *')
        print('*********************************')
    writer = SummaryWriter(save_root.replace('checkpoints', 'runs'))

    # dump configurations for potential future reference
    with open(os.path.join(save_root, 'cfg.json'), 'w') as fp:
        json.dump(vars(args), fp, indent=4, sort_keys=True)
    with open(os.path.join(save_root.replace('checkpoints', 'runs'), 'cfg.json'), 'w') as fp:
        json.dump(vars(args), fp, indent=4, sort_keys=True)

    # set up random seed
    torch.manual_seed(args.seed)
    random.seed(args.seed)

    # initialize models
    if args.start_from == 'baseline':
        args.model_file = os.path.join(
            'checkpoints', 'baseline', 'seed_0',
            '{:d}_{:s}{:d}'.format(args.model_num, args.arch, args.depth),
            'epoch_200.pth')
    elif args.start_from == 'scratch':
        args.model_file = None
    models = utils.get_models(args, train=True, as_ensemble=False,
                              model_file=args.model_file)

    # get data loaders
    trainloader, testloader = utils.get_loaders(args)

    # get optimizers and schedulers
    optimizers = utils.get_optimizers(args, models)
    schedulers = utils.get_schedulers(args, optimizers)

    # train the ensemble
    trainer = DVERGE_Trainer(models, optimizers, schedulers, trainloader, testloader,
                             writer, save_root, **vars(args))
    trainer.run()
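# get_args() is defined elsewhere in the repository and is not shown in this excerpt.
# The sketch below only illustrates the argument surface the DVERGE script above relies
# on; the flag names, types, and defaults are assumptions, not the repository's actual
# parser (it also assumes `import argparse` at module level).
def get_args():
    parser = argparse.ArgumentParser(description='DVERGE ensemble training (sketch)')
    parser.add_argument('--gpu', default='0', type=str)               # -> args.gpu
    parser.add_argument('--seed', default=0, type=int)                # -> args.seed
    parser.add_argument('--model-num', default=3, type=int)           # -> args.model_num
    parser.add_argument('--arch', default='ResNet', type=str)         # -> args.arch
    parser.add_argument('--depth', default=20, type=int)              # -> args.depth
    parser.add_argument('--distill-eps', default=0.07, type=float)    # -> args.distill_eps
    parser.add_argument('--distill-fixed-layer', action='store_true') # -> args.distill_fixed_layer
    parser.add_argument('--distill-layer', default=20, type=int)      # -> args.distill_layer
    parser.add_argument('--plus-adv', action='store_true')            # -> args.plus_adv
    parser.add_argument('--dverge-coeff', default=1.0, type=float)    # -> args.dverge_coeff
    parser.add_argument('--start-from', default='baseline',
                        choices=['baseline', 'scratch'])               # -> args.start_from
    return parser.parse_args()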
def main():
    # get args
    args = get_args()

    # set up gpus
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    assert torch.cuda.is_available()

    # set up writer, logger, and save directory for models
    save_root = os.path.join(
        'checkpoints', 'baseline', 'seed_{:d}'.format(args.seed),
        '{:d}_{:s}{:d}'.format(args.model_num, args.arch, args.depth))
    if not os.path.exists(save_root):
        os.makedirs(save_root)
    else:
        print('*********************************')
        print('* The checkpoint already exists *')
        print('*********************************')
    writer = SummaryWriter(save_root.replace('checkpoints', 'runs'))

    # dump configurations for potential future reference
    with open(os.path.join(save_root, 'cfg.json'), 'w') as fp:
        json.dump(vars(args), fp, indent=4)
    with open(os.path.join(save_root.replace('checkpoints', 'runs'), 'cfg.json'), 'w') as fp:
        json.dump(vars(args), fp, indent=4)

    # set up random seed
    torch.manual_seed(args.seed)

    # initialize models
    models = utils.get_models(args, train=True, as_ensemble=False, model_file=None)

    # get data loaders
    trainloader, testloader = utils.get_loaders(args)

    # get optimizers and schedulers
    optimizers = utils.get_optimizers(args, models)
    schedulers = utils.get_schedulers(args, optimizers)

    # train the ensemble
    trainer = Baseline_Trainer(models, optimizers, schedulers, trainloader, testloader,
                               writer, save_root, **vars(args))
    trainer.run()
                    choices=utils.get_datasets())

# Restart train or continue
PARSER.add_argument("--restart", action='store_true')

# Learning rate decay arguments
PARSER.add_argument("--lr_decay", action="store_true")
PARSER.add_argument("--lr_decay_epochs", type=int, default=25)
PARSER.add_argument("--lr_decay_factor", type=float, default=0.1)

# L2 regularization arguments
PARSER.add_argument("--l2_penalty", type=float, default=0.0)

# Optimization arguments
PARSER.add_argument("--optimizer",
                    choices=utils.get_optimizers(),
                    default="MomentumOptimizer")
PARSER.add_argument("--optimizer_args",
                    type=json.loads,
                    default='''
    {
        "learning_rate": 1e-2,
        "momentum": 0.9
    }''')
PARSER.add_argument("--batch_size", type=int, default=128)
PARSER.add_argument("--epochs", type=int, default=150)

# Hardware
PARSER.add_argument("--train_device", default="/gpu:0")
PARSER.add_argument("--eval_device", default="/gpu:0")
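# Example invocation (sketch): the script name and the dataset flag are assumptions,
# since the argument whose definition is truncated above ("choices=utils.get_datasets()")
# is not fully visible; the remaining flags and values come from the parser entries shown.
#
#   python train.py --dataset cifar10 \
#       --optimizer MomentumOptimizer \
#       --optimizer_args '{"learning_rate": 1e-2, "momentum": 0.9}' \
#       --batch_size 128 --epochs 150 \
#       --lr_decay --lr_decay_epochs 25 --lr_decay_factor 0.1 \
#       --l2_penalty 0.0 \
#       --train_device /gpu:0 --eval_device /gpu:0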
def train_without_trainer(args):
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    batch_size = int(args['--batch-size'])
    logging_steps = int(args['--log-every'])

    tokenizer = transformers.AlbertTokenizer.from_pretrained('albert-base-v2',
                                                             cache_dir=cache_dir)
    albert_for_math_config = transformers.AlbertConfig(
        hidden_size=768,
        num_attention_heads=12,
        intermediate_size=3072,
    )

    print('Loading Data...')
    train_data = torch.load('./data/train_data_train-easy_algebra__linear_1d.pt')
    dev_data = torch.load('./data/dev_data_train-easy_algebra__linear_1d.pt')
    print('Finished loading data')

    data_collator = AnswerMaskDataCollator(tokenizer)
    train_dataloader = torch.utils.data.DataLoader(
        train_data,
        batch_size=batch_size,
        sampler=torch.utils.data.sampler.RandomSampler(train_data),
        collate_fn=data_collator.collate_batch)

    if args['--load']:
        model = transformers.AlbertForMaskedLM.from_pretrained(args['--load-from'])
        optimizer = get_optimizers(model, float(args['--lr']))
        optimizer.load_state_dict(
            torch.load(os.path.join(args['--load-from'], "optimizer.pt"),
                       map_location=device))
        global_step = int(args['--load-from'].split("-")[-1].split("/")[0])
        epochs_trained = global_step // len(train_dataloader)
        steps_trained_in_current_epoch = global_step % len(train_dataloader)
        epoch = epochs_trained
        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
        logger.info("  Will skip the first %d steps in the first epoch",
                    steps_trained_in_current_epoch)
    else:
        model = transformers.AlbertForMaskedLM(albert_for_math_config)
        optimizer = get_optimizers(model, float(args['--lr']))
        global_step = 0
        epochs_trained = 0
        steps_trained_in_current_epoch = 0
        epoch = 0

    model.to(device)
    for state in optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.to(device)

    max_epoch = int(args['--max-epoch'])
    t_total = len(train_dataloader) * max_epoch
    tr_loss = 0.0
    logging_loss = 0.0
    min_eval_loss = 1e20  # might be too high
    valid_niter = int(args['--valid-niter'])

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_data))
    logger.info("  Num Epochs = %d", max_epoch)
    logger.info("  train batch size = %d", batch_size)
    logger.info("  Total optimization steps = %d", t_total)

    num_eval_samples = 4096
    checkpoint_prefix = 'checkpoint'

    while epoch < max_epoch:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, inputs in enumerate(epoch_iterator):
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            tr_loss += train_step(model, inputs, device)
            torch.nn.utils.clip_grad_norm_(model.parameters(), float(args['--clip-grad']))
            optimizer.step()
            model.zero_grad()
            global_step += 1

            if global_step % logging_steps == 0:
                logs: Dict[str, float] = {}
                logs["loss"] = (tr_loss - logging_loss) / logging_steps
                logs["lr"] = optimizer.defaults['lr']  # possible RuntimeError
                logs["epoch"] = epoch
                logs["step"] = global_step
                logging_loss = tr_loss
                log(logs)

            if global_step % valid_niter == 0:
                eval_loss = 0.0
                description = "Evaluation"
                sampler = torch.utils.data.sampler.SequentialSampler(
                    dev_data[:num_eval_samples])
                eval_dataloader = torch.utils.data.DataLoader(
                    dev_data[:num_eval_samples],
                    sampler=sampler,
                    batch_size=batch_size,
                    collate_fn=data_collator.collate_batch,
                )
                logger.info("***** Running %s *****", description)
                logger.info("  Num Examples = %d", num_eval_samples)
                logger.info("  Batch size = %d", batch_size)
                for inputs in tqdm(eval_dataloader, desc=description):
                    for k, v in inputs.items():
                        inputs[k] = v.to(device)
                    model.eval()
                    with torch.no_grad():
                        outputs = model(**inputs)
                        loss = outputs[0]
                        eval_loss += loss.item()
                print("\nEvaluation loss = %f" % (eval_loss / num_eval_samples))

                if eval_loss / num_eval_samples * batch_size < min_eval_loss:
                    min_eval_loss = eval_loss / num_eval_samples * batch_size
                    # save model and optimizer
                    output_dir = os.path.join(args['--save-to'] + '/validations/',
                                              f"{checkpoint_prefix}-{global_step}")
                    os.makedirs(output_dir, exist_ok=True)
                    model.save_pretrained(output_dir)
                    output_dir = os.path.join(args['--save-to'] + '/validations/')
                    rotate_checkpoints(output_dir)
                    output_dir = os.path.join(args['--save-to'] + '/validations/',
                                              f"{checkpoint_prefix}-{global_step}")
                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))

            if global_step % int(args['--save-every']) == 0:
                output_dir = os.path.join(args['--save-to'],
                                          f"{checkpoint_prefix}-{global_step}")
                os.makedirs(output_dir, exist_ok=True)
                model.save_pretrained(output_dir)
                output_dir = os.path.join(args['--save-to'])
                rotate_checkpoints(output_dir)
                output_dir = os.path.join(args['--save-to'],
                                          f"{checkpoint_prefix}-{global_step}")
                torch.save(optimizer.state_dict(),
                           os.path.join(output_dir, "optimizer.pt"))

        epoch_iterator.close()
        epoch += 1

    logger.info(
        "\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n"
    )
def main():
    # get args
    args = get_args()

    # set up gpus
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    assert torch.cuda.is_available()

    # set up writer, logger, and save directory for models
    save_root = os.path.join('checkpoints', 'transfer', 'seed_{:d}'.format(args.seed),
                             '{:s}{:d}'.format(args.arch, args.depth))
    save_root += "%.2f" % args.transfer_coeff
    if not os.path.exists(save_root):
        os.makedirs(save_root)
    else:
        print('*********************************')
        print('* The checkpoint already exists *')
        print('*********************************')
    writer = SummaryWriter(save_root.replace('checkpoints', 'runs'))

    # dump configurations for potential future reference
    with open(os.path.join(save_root, 'cfg.json'), 'w') as fp:
        json.dump(vars(args), fp, indent=4)
    with open(os.path.join(save_root.replace('checkpoints', 'runs'), 'cfg.json'), 'w') as fp:
        json.dump(vars(args), fp, indent=4)

    # set up random seed
    torch.manual_seed(args.seed)

    # initialize models
    models = utils.get_models(args, train=True, as_ensemble=False,
                              model_file="/sync_transfer/CIFAR/epoch_200.pth",
                              dataset="CIFAR-10")

    # get data loaders
    source_trainloader, source_testloader = utils.get_loaders(args, dataset="CIFAR-10")
    target_trainloader, target_testloader = utils.get_loaders(args, dataset="STL-10")

    # get optimizers and schedulers
    optimizers = utils.get_optimizers(args, models)
    schedulers = utils.get_schedulers(args, optimizers)

    surrogate = utils.get_models(args, train=False, as_ensemble=False,
                                 model_file="/sync_transfer/STL/epoch_200.pth",
                                 dataset="STL-10")

    trainer = Transfer_Trainer(models, optimizers, schedulers, source_trainloader,
                               source_testloader, target_trainloader, surrogate,
                               writer, save_root, **vars(args))
    trainer.run()