    clip = abs(args.clip)
    # settings.MILESTONES = [120, 150, 180]
    optimizer = FromageCSV7(net.parameters(), lr=args.lr, weight_decay=args.wd,
                            bias_clip=clip, beta1=args.momentum, beta2=args.beta)
elif args.optimizer == 'fromage':
    print("using fromage!")
    if args.clip == 0:
        clip = math.inf
    else:
        clip = abs(args.clip)
    # settings.MILESTONES = [120, 150, 180]
    optimizer = Fromage(net.parameters(), lr=args.lr)
elif args.optimizer == 'madam':
    print("using madam!")
    if args.clip == 0:
        clip = math.inf
    else:
        clip = abs(args.clip)
    # settings.MILESTONES = [120, 150, 180]
    optimizer = Madam(net.parameters(), lr=args.lr)

if args.sch == "step":
    train_scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=settings.MILESTONES, gamma=args.gamma)  # learning rate decay
elif args.sch == "poly":
    train_scheduler = PolyLR(optimizer,
def main():
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    # set seeds
    np.random.seed(args.seed)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info('args = %s', args)

    # Get data loaders.
    traindir = os.path.join(args.data, 'train')
    validdir = os.path.join(args.data, 'val')

    # data augmentation
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.2),
        transforms.ToTensor(),
        normalize,
    ])
    val_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize,
    ])
    train_data = dset.ImageFolder(traindir, transform=train_transform)
    valid_data = dset.ImageFolder(validdir, transform=val_transform)

    # dataset split
    valid_data, test_data = utils.dataset_split(valid_data, len(valid_data))

    train_sampler = torch.utils.data.distributed.DistributedSampler(train_data)
    train_queue = torch.utils.data.DataLoader(train_data,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              pin_memory=True,
                                              num_workers=8,
                                              sampler=train_sampler)
    valid_queue = torch.utils.data.DataLoader(valid_data,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              pin_memory=True,
                                              num_workers=8)
    test_queue = torch.utils.data.DataLoader(test_data,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             pin_memory=True,
                                             num_workers=8)

    # Create model and loss.
    torch.hub.set_dir('/tmp/hub_cache_%d' % args.local_rank)
    model = torch.hub.load('pytorch/vision:v0.4.2', 'resnet50', pretrained=False)
    model = model.cuda()
    model = DDP(model, delay_allreduce=True)
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    criterion_smooth = CrossEntropyLabelSmooth(CLASSES, args.label_smooth)
    criterion_smooth = criterion_smooth.cuda()

    # Set up network weights optimizer.
    if args.optimizer == 'SGD':
        optimizer = torch.optim.SGD(model.parameters(),
                                    args.learning_rate,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
    elif args.optimizer == 'fromage':
        optimizer = Fromage(model.parameters(), args.learning_rate)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     args.learning_rate,
                                     weight_decay=args.weight_decay)
    else:
        raise NotImplementedError
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, gamma=0.1, step_size=30)

    # Train.
    global_step = 0
    best_acc_top1 = 0
    for epoch in range(args.epochs):
        # Shuffle the sampler, update lrs.
        train_queue.sampler.set_epoch(epoch + args.seed)

        # Training.
        train_acc_top1, train_acc_top5, train_obj, global_step = train(
            train_queue, model, criterion_smooth, optimizer, global_step)
        logging.info('epoch %d train_acc %f', epoch, train_acc_top1)
        writer.add_scalar('train/loss', train_obj, global_step)
        writer.add_scalar('train/acc_top1', train_acc_top1, global_step)
        writer.add_scalar('train/acc_top5', train_acc_top5, global_step)
        writer.add_scalar('train/lr',
                          optimizer.state_dict()['param_groups'][0]['lr'],
                          global_step)

        # Validation.
        valid_acc_top1, valid_acc_top5, valid_obj = infer(
            valid_queue, model, criterion)
        logging.info('valid_acc_top1 %f', valid_acc_top1)
        logging.info('valid_acc_top5 %f', valid_acc_top5)
        writer.add_scalar('val/acc_top1', valid_acc_top1, global_step)
        writer.add_scalar('val/acc_top5', valid_acc_top5, global_step)
        writer.add_scalar('val/loss', valid_obj, global_step)

        # Test.
        test_acc_top1, test_acc_top5, test_obj = infer(test_queue, model, criterion)
        logging.info('test_acc_top1 %f', test_acc_top1)
        logging.info('test_acc_top5 %f', test_acc_top5)
        writer.add_scalar('test/acc_top1', test_acc_top1, global_step)
        writer.add_scalar('test/acc_top5', test_acc_top5, global_step)
        writer.add_scalar('test/loss', test_obj, global_step)

        is_best = False
        if valid_acc_top1 > best_acc_top1:
            best_acc_top1 = valid_acc_top1
            is_best = True
        if args.local_rank == 0:
            utils.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_acc_top1': best_acc_top1,
                    'optimizer': optimizer.state_dict(),
                }, is_best, args.save)

        # Update LR.
        scheduler.step()
        writer.flush()
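Each of these listings selects the Fromage optimizer by name but does not show its update rule. Below is a minimal illustrative sketch of a Fromage-style step, assuming the update w <- (w - lr * g * ||w|| / ||g||) / sqrt(1 + lr^2) from the Fromage paper; it is not the exact Fromage class imported by these scripts (which also exposes options such as p_bound).

# Illustrative sketch only: a Fromage-style update applied per parameter tensor,
# assuming the rule w <- (w - lr * g * ||w|| / ||g||) / sqrt(1 + lr^2).
import math
import torch

@torch.no_grad()
def fromage_style_step(params, lr):
    prefactor = 1.0 / math.sqrt(1.0 + lr ** 2)
    for p in params:
        if p.grad is None:
            continue
        p_norm, g_norm = p.norm(), p.grad.norm()
        if p_norm > 0.0 and g_norm > 0.0:
            # Rescale the gradient to the parameter's own norm before stepping.
            p.add_(p.grad * (p_norm / g_norm), alpha=-lr)
        else:
            p.add_(p.grad, alpha=-lr)
        # Shrink the parameters so their norm does not grow under repeated steps.
        p.mul_(prefactor)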
stored_loss = 100000000

# At any point you can hit Ctrl + C to break out of training early.
try:
    # optimizer = None
    # Ensure the optimizer is optimizing params, which includes both the model's
    # weights as well as the criterion's weight (i.e. Adaptive Softmax)
    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(params, lr=args.lr, weight_decay=args.wdecay)
    if args.optimizer == 'adam':
        optimizer = torch.optim.Adam(params, lr=args.lr, weight_decay=args.wdecay)
    if args.optimizer == 'fromage':
        optimizer = Fromage(params, lr=args.lr)
    if args.optimizer == 'adamw':
        optimizer = AdamW(params, lr=args.lr, weight_decay=args.wdecay)
    if args.optimizer == 'radam':
        optimizer = RAdam(params, lr=args.lr, weight_decay=args.wdecay)
    if args.optimizer.lower() == 'adabelief':
        optimizer = AdaBelief(params, lr=args.lr, weight_decay=args.wdecay,
                              eps=args.eps, betas=(args.beta1, args.beta2))
    if args.optimizer == 'adabound':
        optimizer = AdaBound(params, lr=args.lr, weight_decay=args.wdecay,
                             final_lr=30,
ntokens = len(corpus.dictionary)
if args.model == 'Transformer':
    model = model.TransformerModel(ntokens, args.emsize, args.nhead, args.nhid,
                                   args.nlayers, args.dropout).to(device)
else:
    model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                           args.nlayers, args.dropout, args.tied).to(device)

criterion = nn.NLLLoss()

if args.optim == 'sgd':
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)
if args.optim == 'adam':
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
if args.optim == 'fromage':
    optimizer = Fromage(model.parameters(), lr=args.lr, p_bound=args.p_bound)

###############################################################################
# Training code
###############################################################################


def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)
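The training loop itself is not included in this listing. The sketch below shows one assumed epoch of truncated-BPTT training with the optimizer chosen above; get_batch, train_data, args.bptt, args.clip, args.batch_size and model.init_hidden are taken from the PyTorch word_language_model example this script mirrors, not from the code shown here.

# Assumed training loop (sketch); helper names follow the upstream
# word_language_model example rather than this listing.
def train_one_epoch():
    model.train()
    total_loss = 0.
    if args.model != 'Transformer':
        hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        optimizer.zero_grad()
        if args.model == 'Transformer':
            output = model(data)
        else:
            # Detach the hidden state so gradients stop at the BPTT boundary.
            hidden = repackage_hidden(hidden)
            output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()
        total_loss += loss.item()
    return total_loss / (batch + 1)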
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter(log_dir="runs/" + args.output_dir)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = (RandomSampler(train_dataset) if args.local_rank == -1
                     else DistributedSampler(train_dataset))
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0
        },
    ]

    if args.optim == "adam":
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        print(f"\n using Adam with lr {args.learning_rate}\n")
    elif args.optim == "fromage":
        optimizer = Fromage(optimizer_grouped_parameters, lr=args.learning_rate)
        print(f"\n using fromage with lr {args.learning_rate}\n")
    elif args.optim == "SGD":
        optimizer = torch.optim.SGD(optimizer_grouped_parameters,
                                    lr=args.learning_rate)
        print(f"\n using SGD with lr {args.learning_rate}\n")
    else:
        raise Exception("that optim is not implemented")

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
            os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 1
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (
                len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    # Added here for reproducibility
    set_seed(args)

    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)

            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }

            if args.model_type in ["xlm", "roberta", "distilbert"]:
                del inputs["token_type_ids"]

            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
                if args.version_2_with_negative:
                    inputs.update({"is_impossible": batch[7]})

            outputs = model(**inputs)
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                                   args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                # Log metrics
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Only evaluate when single GPU otherwise metrics may not average well
                    if args.local_rank == -1 and args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss",
                                         (tr_loss - logging_loss) / args.logging_steps,
                                         global_step)
                    logging_loss = tr_loss

                # Save model checkpoint
                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    output_dir = os.path.join(args.output_dir,
                                              "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(model, "module") else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
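For context, a hedged sketch of how this train() function is typically invoked from a run_squad-style main routine; the surrounding setup of args, model, tokenizer, and the cached dataset is assumed rather than shown in this listing.

# Assumed call site (sketch): build the training features, then train.
train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False,
                                        output_examples=False)
global_step, tr_loss = train(args, train_dataset, model, tokenizer)
logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)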
netG = Generator().to(device)
netD = Discriminator().to(device)

print("Generator:")
print(f"{sum(p.numel() for p in netG.parameters())} parameters")
print(f"{len(list(netG.parameters()))} tensors")

print("\nDiscriminator:")
print(f"{sum(p.numel() for p in netD.parameters())} parameters")
print(f"{len(list(netD.parameters()))} tensors")

if args.optim == 'sgd':
    optG = torch.optim.SGD(netG.parameters(), lr=args.initial_lr)
    optD = torch.optim.SGD(netD.parameters(), lr=args.initial_lr)
elif args.optim == 'fromage':
    optG = Fromage(netG.parameters(), lr=args.initial_lr)
    optD = Fromage(netD.parameters(), lr=args.initial_lr)
elif args.optim == 'lars':
    optG = Lars(netG.parameters(), lr=args.initial_lr)
    optD = Lars(netD.parameters(), lr=args.initial_lr)
elif args.optim == 'adam':
    optG = torch.optim.Adam(netG.parameters(), lr=args.initial_lr,
                            betas=(0.0, 0.999), eps=1e-08)
    optD = torch.optim.Adam(netD.parameters(), lr=args.initial_lr,
                            betas=(0.0, 0.999), eps=1e-08)
else:
    raise Exception("Unsupported optim")
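The two optimizers above drive an alternating GAN update. The sketch below shows one assumed discriminator step followed by one generator step under a non-saturating logistic loss; the loss choice, the noise dimension z_dim, and the shape of netD's output (one logit per sample) are assumptions, not taken from this listing.

# Assumed alternating GAN update (sketch), not the exact loop used by this script.
bce = torch.nn.BCEWithLogitsLoss()

def gan_step(real, z_dim=128):
    batch = real.size(0)

    # Discriminator update: push real samples toward label 1, fakes toward 0.
    optD.zero_grad()
    z = torch.randn(batch, z_dim, device=device)
    fake = netG(z).detach()  # detach so the generator gets no gradient here
    lossD = (bce(netD(real), torch.ones(batch, 1, device=device)) +
             bce(netD(fake), torch.zeros(batch, 1, device=device)))
    lossD.backward()
    optD.step()

    # Generator update: make the discriminator label fresh fakes as real.
    optG.zero_grad()
    z = torch.randn(batch, z_dim, device=device)
    lossG = bce(netD(netG(z)), torch.ones(batch, 1, device=device))
    lossG.backward()
    optG.step()

    return lossD.item(), lossG.item()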