def train(epoch, model, optimizer, scheduler):
    global global_step

    epoch_loss = 0.0
    running_num = 0
    running_loss = np.zeros(3)

    train_sampler.set_epoch(epoch)
    model.train()

    bar = tqdm(train_loader) if args.local_rank == 0 else train_loader
    for batch_idx, (x, c) in enumerate(bar):
        scheduler.step()
        global_step += 1

        x, c = x.to(device, non_blocking=True), c.to(device, non_blocking=True)

        optimizer.zero_grad()

        log_p, logdet = model(x, c)
        log_p, logdet = torch.mean(log_p), torch.mean(logdet)
        loss = -(log_p + logdet)

        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()

        nn.utils.clip_grad_norm_(amp.master_params(optimizer), 1.)
        optimizer.step()

        running_num += 1
        running_loss[0] += loss.item()
        running_loss[1] += log_p.item()
        running_loss[2] += logdet.item()
        epoch_loss += loss.item()

        if args.local_rank == 0:
            bar.set_description('{}/{}, [Log pdf, Log p(z), Log Det] : {}'
                                .format(epoch, global_step, running_loss / running_num))

        if (batch_idx + 1) % 100 == 0:
            running_num = 0
            running_loss = np.zeros(3)

        del x, c, log_p, logdet, loss
    del running_loss
    gc.collect()

    print('{}/{}/{} Training Loss : {:.4f}'.format(
        epoch, global_step, args.local_rank, epoch_loss / len(train_loader)))
    return epoch_loss / len(train_loader)

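
# The loops in this file share the same apex mixed-precision idiom. Below is a
# minimal, self-contained sketch of that idiom; the model, data, and
# hyper-parameters are illustrative placeholders, not taken from any snippet here.
import torch
import torch.nn as nn
from apex import amp

model = nn.Linear(16, 4).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
criterion = nn.CrossEntropyLoss()

for _ in range(10):
    x = torch.randn(8, 16, device="cuda")
    y = torch.randint(0, 4, (8,), device="cuda")
    optimizer.zero_grad()
    loss = criterion(model(x), y)
    # Scale the loss so fp16 gradients do not underflow, then backprop.
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    # Clip the fp32 master gradients rather than the fp16 model gradients.
    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), 1.0)
    optimizer.step()
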
def train(args, train_dataset, model: PreTrainedModel,
          tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter(comment=args.summary_comment)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    if args.wiki_dataset:
        collate_fn = functools.partial(collate_wiki, tokenizer)
    else:
        collate_fn = functools.partial(collate, tokenizer)
    train_dataloader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        batch_size=args.train_batch_size,
        collate_fn=collate_fn,
    )

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if (args.model_name_or_path
            and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
            and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True,
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model_to_resize = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model_to_resize.resize_token_embeddings(len(tokenizer))

    model.zero_grad()
    train_iterator = trange(
        epochs_trained,
        int(args.num_train_epochs),
        desc="Epoch",
        disable=args.local_rank not in [-1, 0],
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            if args.wiki_dataset:
                if args.mlm:
                    raise RuntimeError("Can't do mlm for wiki / dictionary dataset")
                tokens, loss_mask = batch
                inputs, labels = (tokens, tokens)
                loss_mask = loss_mask.to(args.device)
                loss_weights = (~loss_mask) + loss_mask * args.title_scale
                inputs = inputs.to(args.device)
                labels = labels.to(args.device)
                model.train()
                outputs = model(inputs, labels=labels, loss_weights=loss_weights)
            else:
                inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
                inputs = inputs.to(args.device)
                labels = labels.to(args.device)
                model.train()
                outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)

            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (model.module if hasattr(model, "module") else model)  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step

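
# For reference, the "total train batch size" logged in the function above is the
# product of the per-GPU batch size, the accumulation steps, and the world size.
# Quick sanity check with made-up numbers (not values from this script):
per_gpu_train_batch_size = 8
gradient_accumulation_steps = 4
world_size = 2   # torch.distributed.get_world_size() when local_rank != -1
n_gpu = 1        # one visible GPU per process under DistributedDataParallel

train_batch_size = per_gpu_train_batch_size * max(1, n_gpu)
effective_batch_size = train_batch_size * gradient_accumulation_steps * world_size
print(effective_batch_size)  # 64
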
def train(args, labeled_trainloader, unlabeled_trainloader, test_loader,
          model, optimizer, ema_model, scheduler):
    if args.amp:
        from apex import amp
    global best_acc
    test_accs = []
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    losses_x = AverageMeter()
    losses_u = AverageMeter()
    mask_probs = AverageMeter()
    end = time.time()

    labeled_iter = iter(labeled_trainloader)
    unlabeled_iter = iter(unlabeled_trainloader)

    model.train()
    for epoch in range(args.start_epoch, args.epochs):
        if not args.no_progress:
            p_bar = tqdm(range(args.eval_step),
                         disable=args.local_rank not in [-1, 0])
        for batch_idx in range(args.eval_step):
            # restart the iterators once a loader is exhausted
            try:
                inputs_x, targets_x = next(labeled_iter)
            except:
                labeled_iter = iter(labeled_trainloader)
                inputs_x, targets_x = next(labeled_iter)

            try:
                (inputs_u_w, inputs_u_s), _ = next(unlabeled_iter)
            except:
                unlabeled_iter = iter(unlabeled_trainloader)
                (inputs_u_w, inputs_u_s), _ = next(unlabeled_iter)

            data_time.update(time.time() - end)
            batch_size = inputs_x.shape[0]
            inputs = interleave(
                torch.cat((inputs_x, inputs_u_w, inputs_u_s)), 2 * args.mu + 1).to(args.device)
            targets_x = targets_x.to(args.device)
            logits = model(inputs)
            logits = de_interleave(logits, 2 * args.mu + 1)
            logits_x = logits[:batch_size]
            logits_u_w, logits_u_s = logits[batch_size:].chunk(2)
            del logits

            Lx = F.cross_entropy(logits_x, targets_x, reduction='mean')

            pseudo_label = torch.softmax(logits_u_w.detach() / args.T, dim=-1)
            max_probs, targets_u = torch.max(pseudo_label, dim=-1)
            mask = max_probs.ge(args.threshold).float()

            Lu = (F.cross_entropy(logits_u_s, targets_u, reduction='none') * mask).mean()

            loss = Lx + args.lambda_u * Lu

            if args.amp:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            losses.update(loss.item())
            losses_x.update(Lx.item())
            losses_u.update(Lu.item())
            optimizer.step()
            scheduler.step()
            if args.use_ema:
                ema_model.update(model)
            model.zero_grad()

            batch_time.update(time.time() - end)
            end = time.time()
            mask_probs.update(mask.mean().item())
            if not args.no_progress:
                p_bar.set_description(
                    "Train Epoch: {epoch}/{epochs:4}. Iter: {batch:4}/{iter:4}. LR: {lr:.4f}. "
                    "Data: {data:.3f}s. Batch: {bt:.3f}s. Loss: {loss:.4f}. "
                    "Loss_x: {loss_x:.4f}. Loss_u: {loss_u:.4f}. Mask: {mask:.2f}. ".format(
                        epoch=epoch + 1,
                        epochs=args.epochs,
                        batch=batch_idx + 1,
                        iter=args.eval_step,
                        lr=scheduler.get_last_lr()[0],
                        data=data_time.avg,
                        bt=batch_time.avg,
                        loss=losses.avg,
                        loss_x=losses_x.avg,
                        loss_u=losses_u.avg,
                        mask=mask_probs.avg))
                p_bar.update()

        if not args.no_progress:
            p_bar.close()

        if args.use_ema:
            test_model = ema_model.ema
        else:
            test_model = model

        if args.local_rank in [-1, 0]:
            test_loss, test_acc = test(args, test_loader, test_model, epoch)

            args.writer.add_scalar('train/1.train_loss', losses.avg, epoch)
            args.writer.add_scalar('train/2.train_loss_x', losses_x.avg, epoch)
            args.writer.add_scalar('train/3.train_loss_u', losses_u.avg, epoch)
            args.writer.add_scalar('train/4.mask', mask_probs.avg, epoch)
            args.writer.add_scalar('test/1.test_acc', test_acc, epoch)
            args.writer.add_scalar('test/2.test_loss', test_loss, epoch)

            is_best = test_acc > best_acc
            best_acc = max(test_acc, best_acc)

            model_to_save = model.module if hasattr(model, "module") else model
            if args.use_ema:
                ema_to_save = ema_model.ema.module if hasattr(
                    ema_model.ema, "module") else ema_model.ema
            save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': model_to_save.state_dict(),
                'ema_state_dict': ema_to_save.state_dict() if args.use_ema else None,
                'acc': test_acc,
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
            }, is_best, args.out)

            test_accs.append(test_acc)
            logger.info('Best top-1 acc: {:.2f}'.format(best_acc))
            logger.info('Mean top-1 acc: {:.2f}\n'.format(np.mean(test_accs[-20:])))

    if args.local_rank in [-1, 0]:
        args.writer.close()

def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        loss_this_epoch = 0
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            # NOTE: the non-mlm branch only yields (inputs, labels); it would also need
            # an attention mask before it could be unpacked into three values.
            inputs, labels, attention_mask = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            attention_mask = attention_mask.to(args.device)
            model.train()
            outputs = model(inputs, attention_mask=attention_mask, masked_lm_labels=labels)  # if args.mlm else model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
            loss_this_epoch = loss + loss_this_epoch

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break
        print('\ntrain loss epoch ... {} (not exact loss)'.format(loss_this_epoch / step))

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step

def train(self, num_epochs, federated_model, global_epoch):
    self.y_err = {'train': [], 'val': []}
    self.y_loss = {'train': [], 'val': []}

    self.model.load_state_dict(federated_model.state_dict())
    self.model.classifier.classifier = self.classifier
    self.model = self.model.to(device)

    optimizer = get_optimizer(self.model, opt.lr)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=40, gamma=0.1)

    if fp16:
        self.model, optimizer = amp.initialize(self.model, optimizer, opt_level="O1")

    criterion = nn.CrossEntropyLoss()

    warm_up = 0.1
    warm_iteration = round(self.datanum / opt.batchsize) * opt.warm_epoch

    since = time.time()
    print('Client', self.cid, 'start training')
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        for phase in ['train', 'val']:
            if phase == 'train':
                scheduler.step()
                self.model.train(True)
            else:
                self.model.train(False)

            running_loss = 0.0
            running_corrects = 0.0

            for data in self.loaders[phase]:
                inputs, labels = data
                now_batch_size, c, h, w = inputs.shape
                if now_batch_size < opt.batchsize:
                    continue

                if use_cuda:
                    inputs = Variable(inputs.cuda().detach())
                    labels = Variable(labels.cuda().detach())
                else:
                    inputs, labels = Variable(inputs), Variable(labels)

                optimizer.zero_grad()

                if phase == 'val':
                    with torch.no_grad():
                        outputs = self.model(inputs)
                else:
                    outputs = self.model(inputs)

                if not opt.PCB:
                    _, preds = torch.max(outputs.data, 1)
                    loss = criterion(outputs, labels)
                else:
                    part = {}
                    sm = nn.Softmax(dim=1)
                    num_part = 6
                    for i in range(num_part):
                        part[i] = outputs[i]

                    score = sm(part[0]) + sm(part[1]) + sm(part[2]) + sm(part[3]) + sm(part[4]) + sm(part[5])
                    _, preds = torch.max(score.data, 1)

                    loss = criterion(part[0], labels)
                    for i in range(num_part - 1):
                        loss += criterion(part[i + 1], labels)

                if epoch < opt.warm_epoch and phase == 'train':
                    warm_up = min(1.0, warm_up + 0.9 / warm_iteration)
                    loss *= warm_up

                if phase == 'train':
                    if fp16:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss.backward()
                    optimizer.step()

                if int(version[0]) > 0 or int(version[2]) > 3:
                    # for the new version like 0.4.0, 0.5.0 and 1.0.0
                    running_loss += loss.item() * now_batch_size
                else:
                    # for the old version like 0.3.0 and 0.3.1
                    running_loss += loss.data[0] * now_batch_size
                running_corrects += float(torch.sum(preds == labels.data))

            epoch_loss = running_loss / (self.dataset_sizes[phase] - self.dataset_sizes[phase] % opt.batchsize)
            epoch_acc = running_corrects / (self.dataset_sizes[phase] - self.dataset_sizes[phase] % opt.batchsize)

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            self.y_loss[phase].append(epoch_loss)
            self.y_err[phase].append(1.0 - epoch_acc)

            if phase == 'val':
                last_model_wts = self.model.state_dict()

        time_elapsed = time.time() - since
        print('Client', self.cid, ' Training complete in {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))

    time_elapsed = time.time() - since
    print('Client', self.cid, 'Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print()

    self.model.load_state_dict(last_model_wts)
    save_network(self.model, self.cid, 'last')
    # self.draw_curve(self.cid, num_epochs, global_epoch, self.y_loss, self.y_err)
    self.classifier = self.model.classifier.classifier
    self.model.classifier.classifier = nn.Sequential()

def do_train(
    model,
    data_loader,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    arguments,
    run_test_func,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    best_map = 0
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        # Note: If mixed precision is not used, this ends up doing nothing
        # Otherwise apply loss scaling for mixed-precision recipe
        with amp.scale_loss(losses, optimizer) as scaled_losses:
            scaled_losses.backward()
        optimizer.step()

        batch_time = time.time() - end
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 50 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration % checkpoint_period == 0 or iteration == max_iter:
            result = run_test_func(model)[0]
            model.train()
            if is_main_process() and float(result['map']) > best_map:
                best_map = float(result['map'])
                checkpointer.save(
                    "model_{:07d}_{:.4f}".format(iteration, best_map), **arguments)
        end = time.time()

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))

def train(train_loader, model, criterion, optimizer, epoch):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()
    end = time.time()

    prefetcher = data_prefetcher(train_loader)
    input, target = prefetcher.next()
    i = 0
    while input is not None:
        i += 1
        if args.prof >= 0 and i == args.prof:
            print("Profiling begun at iteration {}".format(i))
            torch.cuda.cudart().cudaProfilerStart()

        if args.prof >= 0: torch.cuda.nvtx.range_push("Body of iteration {}".format(i))

        adjust_learning_rate(optimizer, epoch, i, len(train_loader))

        # compute output
        if args.prof >= 0: torch.cuda.nvtx.range_push("forward")
        output = model(input)
        if args.prof >= 0: torch.cuda.nvtx.range_pop()
        loss = criterion(output, target)

        # compute gradient and do SGD step
        optimizer.zero_grad()

        if args.prof >= 0: torch.cuda.nvtx.range_push("backward")
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        if args.prof >= 0: torch.cuda.nvtx.range_pop()

        # for param in model.parameters():
        #     print(param.data.double().sum().item(), param.grad.data.double().sum().item())

        if args.prof >= 0: torch.cuda.nvtx.range_push("optimizer.step()")
        optimizer.step()
        if args.prof >= 0: torch.cuda.nvtx.range_pop()

        if i % args.print_freq == 0:
            # Every print_freq iterations, check the loss, accuracy, and speed.
            # For best performance, it doesn't make sense to print these metrics every
            # iteration, since they incur an allreduce and some host<->device syncs.

            # Measure accuracy
            prec1, prec5 = accuracy(output.data, target, topk=(1, 5))

            # Average loss and accuracy across processes for logging
            if args.distributed:
                reduced_loss = reduce_tensor(loss.data)
                prec1 = reduce_tensor(prec1)
                prec5 = reduce_tensor(prec5)
            else:
                reduced_loss = loss.data

            # to_python_float incurs a host<->device sync
            losses.update(to_python_float(reduced_loss), input.size(0))
            top1.update(to_python_float(prec1), input.size(0))
            top5.update(to_python_float(prec5), input.size(0))

            torch.cuda.synchronize()
            batch_time.update((time.time() - end) / args.print_freq)
            end = time.time()

            if args.local_rank == 0:
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Speed {3:.3f} ({4:.3f})\t'
                      'Loss {loss.val:.10f} ({loss.avg:.4f})\t'
                      'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                      'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                          epoch, i, len(train_loader),
                          args.world_size * args.batch_size / batch_time.val,
                          args.world_size * args.batch_size / batch_time.avg,
                          batch_time=batch_time,
                          loss=losses, top1=top1, top5=top5))

        if args.prof >= 0: torch.cuda.nvtx.range_push("prefetcher.next()")
        input, target = prefetcher.next()
        if args.prof >= 0: torch.cuda.nvtx.range_pop()

        # Pop range "Body of iteration {}".format(i)
        if args.prof >= 0: torch.cuda.nvtx.range_pop()

        if args.prof >= 0 and i == args.prof + 10:
            print("Profiling ended at iteration {}".format(i))
            torch.cuda.cudart().cudaProfilerStop()
            quit()

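
# The data_prefetcher used above is not defined in this file. A hedged sketch of a
# typical implementation (modeled on the apex ImageNet example, minus input
# normalization; treat it as an assumption, not the exact class used here): it
# overlaps host-to-device copies with compute using a side CUDA stream.
import torch

class data_prefetcher:
    def __init__(self, loader):
        self.loader = iter(loader)
        self.stream = torch.cuda.Stream()
        self.preload()

    def preload(self):
        try:
            self.next_input, self.next_target = next(self.loader)
        except StopIteration:
            self.next_input = None
            self.next_target = None
            return
        with torch.cuda.stream(self.stream):
            self.next_input = self.next_input.cuda(non_blocking=True)
            self.next_target = self.next_target.cuda(non_blocking=True)

    def next(self):
        # Wait for the async copies issued in preload() before handing data out.
        torch.cuda.current_stream().wait_stream(self.stream)
        input, target = self.next_input, self.next_target
        self.preload()
        return input, target
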
def train(
    num_gpus,
    rank,
    group_name,
    output_directory,
    epochs,
    learning_rate,
    sigma,
    iters_per_checkpoint,
    batch_size,
    seed,
    fp16_run,
    checkpoint_path,
    with_tensorboard,
):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    # =====END: ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    # =====END: ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(
        trainset,
        num_workers=1,
        shuffle=False,
        sampler=train_sampler,
        batch_size=batch_size,
        pin_memory=False,
        drop_last=True,
    )

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, "logs"))

    # fixed for visualization
    real_mels, real_audios = zip(*[trainset[i] for i in range(8)])
    real_mel = torch.cat(real_mels, dim=-1)
    real_audio = torch.cat(real_audios, dim=0)

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if with_tensorboard and rank == 0:
                step = i + len(train_loader) * epoch
                logger.add_scalar("training_loss", reduced_loss, step)
                if step % 500 == 0:
                    # select the first eight data samples
                    model.eval()
                    with torch.no_grad():
                        device = mel.device
                        fake_audio = (
                            model.infer(torch.stack(real_mels).to(device)).flatten(0, 1).cpu())
                    model.train()
                    fake_mel = trainset.get_mel(fake_audio)

                    logger.add_image(
                        "training_mel_real",
                        plot_spectrogram_to_numpy(real_mel),
                        step,
                        dataformats="HWC",
                    )
                    logger.add_audio(
                        "training_audio_real",
                        real_audio,
                        step,
                        22050,
                    )
                    logger.add_image(
                        "training_mel_fake",
                        plot_spectrogram_to_numpy(fake_mel),
                        step,
                        dataformats="HWC",
                    )
                    logger.add_audio(
                        "training_audio_fake",
                        fake_audio,
                        step,
                        22050,
                    )
                    logger.flush()

            if iteration % iters_per_checkpoint == 0:
                if rank == 0:
                    checkpoint_path = "{}/waveglow_{}".format(output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path)

            iteration += 1

def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    opts.rank = rank
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu, hvd.rank(), opts.fp16))

    if opts.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, "
                         "should be >= 1".format(opts.gradient_accumulation_steps))

    set_random_seed(opts.seed)

    # train_examples = None
    LOGGER.info(f"Loading Train Dataset {opts.train_txt_db}, "
                f"{opts.train_img_db}")
    if "paired" in opts.model:
        DatasetCls = Nlvr2PairedDataset
        EvalDatasetCls = Nlvr2PairedEvalDataset
        collate_fn = nlvr2_paired_collate
        eval_collate_fn = nlvr2_paired_eval_collate
        if opts.model == "paired":
            ModelCls = UniterForNlvr2Paired
        elif opts.model == "paired-attn":
            ModelCls = UniterForNlvr2PairedAttn
        else:
            raise ValueError("unrecognized model type")
    elif opts.model == "triplet":
        DatasetCls = Nlvr2TripletDataset
        EvalDatasetCls = Nlvr2TripletEvalDataset
        ModelCls = UniterForNlvr2Triplet
        collate_fn = nlvr2_triplet_collate
        eval_collate_fn = nlvr2_triplet_eval_collate
    else:
        raise ValueError("unrecognized model type")

    # data loaders
    train_dataloader = create_dataloader(
        opts.train_img_db, opts.train_txt_db, opts.train_batch_size,
        True, DatasetCls, collate_fn, opts,
    )
    val_dataloader = create_dataloader(
        opts.val_img_db, opts.val_txt_db, opts.val_batch_size,
        False, EvalDatasetCls, eval_collate_fn, opts,
    )
    test_dataloader = create_dataloader(
        opts.test_img_db, opts.test_txt_db, opts.val_batch_size,
        False, EvalDatasetCls, eval_collate_fn, opts,
    )

    # Prepare model
    if opts.checkpoint:
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}
    model = ModelCls.from_pretrained(opts.model_config,
                                     state_dict=checkpoint,
                                     img_dim=IMG_DIM)
    model.init_type_embedding()
    model.to(device)
    # make sure every process has same model parameters in the beginning
    broadcast_tensors([p.data for p in model.parameters()], 0)
    set_dropout(model, opts.dropout)

    # Prepare optimizer
    optimizer = build_optimizer(model, opts)
    model, optimizer = amp.initialize(model, optimizer,
                                      enabled=opts.fp16, opt_level="O2")

    global_step = 0
    if rank == 0:
        save_training_meta(opts)
        TB_LOGGER.create(join(opts.output_dir, "log"))
        pbar = tqdm(total=opts.num_train_steps)
        model_saver = ModelSaver(join(opts.output_dir, "ckpt"))
        os.makedirs(join(opts.output_dir, "results"))  # store val predictions
        add_log_to_file(join(opts.output_dir, "log", "log.txt"))
    else:
        LOGGER.disabled = True
        pbar = NoOp()
        model_saver = NoOp()

    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info("  Num examples = %d", len(train_dataloader.dataset))
    LOGGER.info("  Batch size = %d", opts.train_batch_size)
    LOGGER.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info("  Num steps = %d", opts.num_train_steps)

    running_loss = RunningMeter("loss")
    model.train()
    n_examples = 0
    n_epoch = 0
    start = time()
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    optimizer.step()
    while True:
        for step, batch in enumerate(train_dataloader):
            targets = batch["targets"]
            n_examples += targets.size(0)

            loss = model(batch, compute_loss=True)
            loss = loss.mean()
            delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0
            with amp.scale_loss(loss, optimizer,
                                delay_unscale=delay_unscale) as scaled_loss:
                scaled_loss.backward()
                if not delay_unscale:
                    # gather gradients from every processes
                    # do this before unscaling to make sure every process uses
                    # the same gradient scale
                    grads = [p.grad.data for p in model.parameters()
                             if p.requires_grad and p.grad is not None]
                    all_reduce_and_rescale_tensors(grads, float(1))

            running_loss(loss.item())

            if (step + 1) % opts.gradient_accumulation_steps == 0:
                global_step += 1

                # learning rate scheduling
                lr_this_step = get_lr_sched(global_step, opts)
                for param_group in optimizer.param_groups:
                    param_group["lr"] = lr_this_step
                TB_LOGGER.add_scalar("lr", lr_this_step, global_step)

                # log loss
                # NOTE: not gathered across GPUs for efficiency
                TB_LOGGER.add_scalar("loss", running_loss.val, global_step)
                TB_LOGGER.step()

                # update model params
                if opts.grad_norm != -1:
                    grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                                opts.grad_norm)
                    TB_LOGGER.add_scalar("grad_norm", grad_norm, global_step)
                optimizer.step()
                optimizer.zero_grad()
                pbar.update(1)

                if global_step % 100 == 0:
                    # monitor training throughput
                    tot_ex = sum(all_gather_list(n_examples))
                    ex_per_sec = int(tot_ex / (time() - start))
                    LOGGER.info(f"Step {global_step}: "
                                f"{tot_ex} examples trained at "
                                f"{ex_per_sec} ex/s")
                    TB_LOGGER.add_scalar("perf/ex_per_s", ex_per_sec, global_step)

                if global_step % opts.valid_steps == 0:
                    for split, loader in [("val", val_dataloader),
                                          ("test", test_dataloader)]:
                        LOGGER.info(f"Step {global_step}: start running "
                                    f"validation on {split} split...")
                        log, results = validate(model, loader, split)
                        with open(f"{opts.output_dir}/results/"
                                  f"{split}_results_{global_step}_"
                                  f"rank{rank}.csv", "w") as f:
                            for id_, ans in results:
                                f.write(f"{id_},{ans}\n")
                        TB_LOGGER.log_scaler_dict(log)
                    model_saver.save(model, global_step)
            if global_step >= opts.num_train_steps:
                break
        if global_step >= opts.num_train_steps:
            break
        n_epoch += 1
        LOGGER.info(f"Step {global_step}: finished {n_epoch} epochs")

    if opts.num_train_steps % opts.valid_steps != 0:
        for split, loader in [("val", val_dataloader),
                              ("test", test_dataloader)]:
            LOGGER.info(f"Step {global_step}: start running "
                        f"validation on {split} split...")
            log, results = validate(model, loader, split)
            with open(f"{opts.output_dir}/results/"
                      f"{split}_results_{global_step}_"
                      f"rank{rank}.csv", "w") as f:
                for id_, ans in results:
                    f.write(f"{id_},{ans}\n")
            TB_LOGGER.log_scaler_dict(log)
        model_saver.save(model, global_step)

def main():
    config_yaml, local_rank = parse_my_arguments()
    args = args_from_yaml(config_yaml)
    args.local_rank = local_rank
    # args = parse_arguments()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    device, args = setup_training(args)

    # Prepare optimizer
    model, optimizer, checkpoint, global_step = prepare_model_and_optimizer(args, device)

    if is_main_process():
        print("SEED {}".format(args.seed))

    if args.do_train:
        if is_main_process():
            logger.info("***** Running training *****")
            # logger.info("  Num examples = %d", len(train_data))
            logger.info("  Batch size = %d", args.train_batch_size)
            print("  LR = ", args.learning_rate)
            print("Training. . .")

        model.train()
        most_recent_ckpts_paths = []

        average_loss = 0.0  # averaged loss every args.log_freq steps
        epoch = 0
        training_steps = 0

        pool = ProcessPoolExecutor(1)

        # Note: We loop infinitely over epochs, termination is handled via iteration count
        while True:
            if not args.resume_from_checkpoint or epoch > 0 or args.phase2:
                files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if
                         os.path.isfile(os.path.join(args.input_dir, f)) and 'training' in f]
                files.sort()
                num_files = len(files)
                random.shuffle(files)
                f_start_id = 0
            else:
                f_start_id = checkpoint['files'][0]
                files = checkpoint['files'][1:]
                args.resume_from_checkpoint = False
                num_files = len(files)

            shared_file_list = {}

            if torch.distributed.is_initialized() and torch.distributed.get_world_size() > num_files:
                remainder = torch.distributed.get_world_size() % num_files
                data_file = files[(f_start_id * torch.distributed.get_world_size() +
                                   torch.distributed.get_rank() + remainder * f_start_id) % num_files]
            else:
                data_file = files[(f_start_id * torch.distributed.get_world_size() +
                                   torch.distributed.get_rank()) % num_files]

            previous_file = data_file

            train_data = pretraining_dataset(data_file, args.max_predictions_per_seq)
            train_sampler = RandomSampler(train_data)
            train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                          batch_size=args.train_batch_size * args.n_gpu,
                                          num_workers=4, pin_memory=True)
            # shared_file_list["0"] = (train_dataloader, data_file)

            overflow_buf = None
            if args.allreduce_post_accumulation:
                overflow_buf = torch.cuda.IntTensor([0])

            for f_id in range(f_start_id + 1, len(files)):
                if torch.distributed.get_world_size() > num_files:
                    data_file = files[(f_id * torch.distributed.get_world_size() +
                                       torch.distributed.get_rank() + remainder * f_id) % num_files]
                else:
                    data_file = files[(f_id * torch.distributed.get_world_size() +
                                       torch.distributed.get_rank()) % num_files]

                logger.info("file no %s file %s" % (f_id, previous_file))

                previous_file = data_file

                dataset_future = pool.submit(create_pretraining_dataset, data_file,
                                             args.max_predictions_per_seq, shared_file_list, args)

                train_iter = tqdm(train_dataloader, desc="Iteration") if is_main_process() else train_dataloader
                for step, batch in enumerate(train_iter):
                    training_steps += 1
                    batch = [t.to(device) for t in batch]
                    input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch
                    loss = model(input_ids=input_ids, token_type_ids=segment_ids,
                                 attention_mask=input_mask, masked_lm_labels=masked_lm_labels,
                                 next_sentence_label=next_sentence_labels,
                                 checkpoint_activations=args.checkpoint_activations)
                    if args.n_gpu > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu.

                    divisor = args.gradient_accumulation_steps
                    if args.gradient_accumulation_steps > 1:
                        if not args.allreduce_post_accumulation:
                            # this division was merged into predivision
                            loss = loss / args.gradient_accumulation_steps
                            divisor = 1.0
                    if args.fp16:
                        with amp.scale_loss(loss, optimizer,
                                            delay_overflow_check=args.allreduce_post_accumulation) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss.backward()
                    average_loss += loss.item()

                    if training_steps % args.gradient_accumulation_steps == 0:
                        global_step = take_optimizer_step(args, optimizer, model, overflow_buf, global_step)

                    if global_step >= args.max_steps:
                        last_num_steps = int(training_steps / args.gradient_accumulation_steps) % args.log_freq
                        last_num_steps = args.log_freq if last_num_steps == 0 else last_num_steps
                        average_loss = torch.tensor(average_loss, dtype=torch.float32).cuda()
                        average_loss = average_loss / (last_num_steps * divisor)
                        if torch.distributed.is_initialized():
                            average_loss /= torch.distributed.get_world_size()
                            torch.distributed.all_reduce(average_loss)
                        if is_main_process():
                            logger.info("Total Steps:{} Final Loss = {}".format(
                                training_steps / args.gradient_accumulation_steps, average_loss.item()))
                    elif training_steps % (args.log_freq * args.gradient_accumulation_steps) == 0:
                        if is_main_process():
                            print("Step:{} Average Loss = {} Step Loss = {} LR {}".format(
                                global_step,
                                average_loss / (args.log_freq * divisor),
                                loss.item() * args.gradient_accumulation_steps / divisor,
                                optimizer.param_groups[0]['lr']))
                        average_loss = 0

                    if global_step >= args.max_steps or training_steps % (
                            args.num_steps_per_checkpoint * args.gradient_accumulation_steps) == 0:
                        if is_main_process():
                            # Save a trained model
                            logger.info("** ** * Saving fine - tuned model ** ** * ")
                            model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
                            if args.resume_step < 0 or not args.phase2:
                                output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step))
                            else:
                                output_save_file = os.path.join(args.output_dir,
                                                                "ckpt_{}.pt".format(global_step + args.phase1_end_step))
                            if args.do_train:
                                torch.save({'model': model_to_save.state_dict(),
                                            'optimizer': optimizer.state_dict(),
                                            'master params': list(amp.master_params(optimizer)),
                                            'files': [f_id] + files}, output_save_file)

                                most_recent_ckpts_paths.append(output_save_file)
                                if len(most_recent_ckpts_paths) > 3:
                                    ckpt_to_be_removed = most_recent_ckpts_paths.pop(0)
                                    os.remove(ckpt_to_be_removed)

                        if global_step >= args.max_steps:
                            del train_dataloader
                            # thread.join()
                            return args

                del train_dataloader
                # thread.join()
                # Make sure pool has finished and switch train_dataloader
                # NOTE: Will block until complete
                train_dataloader, data_file = dataset_future.result(timeout=None)

            epoch += 1

def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .csv files (or other data files) for the task.") parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written.") ## Other parameters parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_lower_case", default=False, action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() # choose device for training # Either no gpu, or 1 gpu if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) # args.seed is just a random seem for initization # np.random.seed(0) makes the random numbers predictable. With the seed reset (every time), the same set of numbers will appear every time. # Use random.seed() to initialize the pseudo-random number generator. random.seed(args.seed) np.random.seed(args.seed) # It will set the seed of the random number generator to a fixed value, # so that when you call for example torch.rand(2), the results will be reproducible. torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) # Sets the seed for generating random numbers on all GPUs. It’s safe to call this function if CUDA is not available; in that case, it is silently ignored. # if not run training and run eval if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") # if path exists and returns a list containing the names of the entries in the directory given by path if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) # Recursive directory creation function. # Like mkdir(), but makes all intermediate-level directories needed to contain the leaf directory. os.makedirs(args.output_dir, exist_ok=True) # Instantiate pretrained pytorch model from pre-trained model configuration. tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None # if do run training if args.do_train: train_dir = os.path.join(args.data_dir, 'train') train_examples = read_race_examples([train_dir+'/high', train_dir+'/middle']) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model # Instantiate a pretrained pytorch model from a pre-trained model configuration. 
model = BertForMultipleChoice.from_pretrained(args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank), num_choices=4) model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] # Add Optimizer Grouped Parameters optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] # t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() # Use 16-bit float precision if args.fp16: try: from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") # Implements Adam algorithm. # Currently GPU-only. optimizer = FusedAdam(model.parameters()) opt_level = 'O2' # amp.initialize’s optimizer argument may be a single optimizer or a list of optimizers, # as long as the output you accept has the same type. # https://nvidia.github.io/apex/advanced.html model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level) else: # Previously BertAdam optimizer was instantiated like this: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global_step = 0 if args.do_train: # Converts a set of InputExamples to a list of InputFeatures. # eg. 
convert_examples_to_features(examples, label_list, max_seq_length, tokenizer) train_features = convert_examples_to_features( train_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for ep in range(int(args.num_train_epochs)): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("Trianing Epoch: {}/{}".format(ep+1, int(args.num_train_epochs))) optimizer.zero_grad() for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear(global_step/t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if global_step%100 == 0: logger.info("Training loss: {}, global step: {}".format(tr_loss/nb_tr_steps, global_step)) ## evaluate on dev set if global_step % 1000 == 0: dev_dir = os.path.join(args.data_dir, 'dev') dev_set = [dev_dir+'/high', dev_dir+'/middle'] eval_examples = read_race_examples(dev_set) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation: Dev *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for step, batch in 
enumerate(eval_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = {'dev_eval_loss': eval_loss, 'dev_eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': tr_loss/nb_tr_steps} output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a+") as writer: logger.info("***** Dev results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) # Save a trained model model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) ## Load a trained model that you have fine-tuned ## use this part if you want to load the trained model # model_state_dict = torch.load(output_model_file) # model = BertForMultipleChoice.from_pretrained(args.bert_model, # state_dict=model_state_dict, # num_choices=4) # model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): test_dir = os.path.join(args.data_dir, 'test') test_high = [test_dir + '/high'] test_middle = [test_dir + '/middle'] # test high eval_examples = read_race_examples(test_high) # eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation: test high *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() high_eval_loss, high_eval_accuracy = 0, 0 high_nb_eval_steps, high_nb_eval_examples = 0, 0 for step, batch in enumerate(eval_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) high_eval_loss += tmp_eval_loss.mean().item() high_eval_accuracy += tmp_eval_accuracy high_nb_eval_examples += input_ids.size(0) high_nb_eval_steps += 1 eval_loss = high_eval_loss / high_nb_eval_steps eval_accuracy = high_eval_accuracy / high_nb_eval_examples result = {'high_eval_loss': eval_loss, 'high_eval_accuracy': eval_accuracy} 
output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a+") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) ## test middle eval_examples = read_race_examples(test_middle) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation: test middle *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() middle_eval_loss, middle_eval_accuracy = 0, 0 middle_nb_eval_steps, middle_nb_eval_examples = 0, 0 for step, batch in enumerate(eval_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) middle_eval_loss += tmp_eval_loss.mean().item() middle_eval_accuracy += tmp_eval_accuracy middle_nb_eval_examples += input_ids.size(0) middle_nb_eval_steps += 1 eval_loss = middle_eval_loss / middle_nb_eval_steps eval_accuracy = middle_eval_accuracy / middle_nb_eval_examples result = {'middle_eval_loss': eval_loss, 'middle_eval_accuracy': eval_accuracy} with open(output_eval_file, "a+") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) ## all test eval_loss = (middle_eval_loss + high_eval_loss) / (middle_nb_eval_steps + high_nb_eval_steps) eval_accuracy = (middle_eval_accuracy + high_eval_accuracy) / (middle_nb_eval_examples + high_nb_eval_examples) result = {'overall_eval_loss': eval_loss, 'overall_eval_accuracy': eval_accuracy} with open(output_eval_file, "a+") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
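The evaluation loops above rely on an accuracy(logits, label_ids) helper plus a pair of running counters (loss averaged per step, accuracy averaged per example). The following is a minimal sketch of that pattern, assuming the helper simply counts correct argmax predictions; the repository's own implementation may differ.

import numpy as np

def accuracy(logits: np.ndarray, labels: np.ndarray) -> int:
    """Number of correct predictions for a batch of multiple-choice logits."""
    return int((np.argmax(logits, axis=1) == labels).sum())

# Toy batches standing in for the detached logits / label_ids above.
batch_logits = [np.array([[0.1, 2.0, -1.0, 0.3], [1.5, 0.2, 0.1, 0.0]]),
                np.array([[0.0, 0.0, 3.0, 0.1]])]
batch_labels = [np.array([1, 0]), np.array([2])]

eval_accuracy, nb_eval_examples = 0, 0
for logits, labels in zip(batch_logits, batch_labels):
    eval_accuracy += accuracy(logits, labels)
    nb_eval_examples += labels.shape[0]
print(eval_accuracy / nb_eval_examples)  # -> 1.0 on this toy data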
def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler_total = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) subset_quantity = args.div_subset # notice: difficulty-based split curriculum_sets_temp = [] # done: make sure every curriculum subset actually gets sampled diff_eval_result = Difficulty_Evaluation(args, train_dataset) for i,subset in enumerate(diff_eval_result): gate = int((len(train_dataset)/args.train_batch_size)/(subset_quantity)) print("subset", i, "num:", len(subset), "threshold", gate) random.shuffle(subset) # if the subset is too small, do not subsample it if len(subset) > gate: # subset = list(subset) # decide the sample length for each subset curriculum_sets_temp.append(subset[0:int( gate /subset_quantity)]) # elif(len(subset) <= int(gate/subset_quantity)): # for i in range(subset_quantity): # curriculum_sets_temp.append(subset) else: curriculum_sets_temp.append(subset) # curriculum_sets_temp.append(subset) # non-subsampled version # diff_eval_result = Difficulty_Evaluation(args, train_dataset) # for _ in range(int(args.num_train_epochs)): # for i, subset in enumerate(diff_eval_result): # random.shuffle(subset) # curriculum_sets_temp.append(subset) # random split # curriculum_sets_temp = Difficulty_Evaluation_Randomly(args,train_dataset) # first add the full-data task curriculum_sets = [] total_train_dataloader = DataLoader(train_dataset, sampler=train_sampler_total, batch_size=args.train_batch_size) for i in range(int(args.num_train_epochs)): curriculum_sets.append(total_train_dataloader) # then add the curriculum tasks # notice: order of the curriculum tasks curriculum_sets += curriculum_sets_temp # curriculum-learning (CL) stage training if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(curriculum_sets[0]) // args.gradient_accumulation_steps) + 1 else: t_total = len(curriculum_sets[0]) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay, }, {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] # notice: add L2 regularization optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon,weight_decay=0.01) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total ) # Check if saved optimizer or scheduler states exist if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt") ): # Load in optimizer and scheduler states optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True )
# Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(curriculum_sets[0])) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 1 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): try: # set global_step to global_step of last saved checkpoint from model path checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(curriculum_sets[0]) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % (len(curriculum_sets[0]) // args.gradient_accumulation_steps) logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange( # epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0] epochs_trained, int(len(curriculum_sets)), desc = "Epoch", disable = args.local_rank not in [-1, 0] ) # Added here for reproducibility set_seed(args) current_stage = 0 for _ in train_iterator: epoch_iterator = tqdm(curriculum_sets[current_stage], desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue model.train() batch = tuple(t.to(args.device) for t in batch) # print("batch_size",batch[0].shape) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], "start_positions": batch[3], "end_positions": batch[4], } if args.model_type in ["xlm", "roberta", "distilbert", "camembert", "bart", "longformer"]: del inputs["token_type_ids"] if args.model_type in ["xlnet", "xlm"]: inputs.update({"cls_index": batch[5], "p_mask": batch[6]}) if args.version_2_with_negative: inputs.update({"is_impossible": batch[7]}) if hasattr(model, "config") and hasattr(model.config, "lang2id"): inputs.update( {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)} ) outputs = model(**inputs) # model outputs are always tuple in transformers (see doc) loss = outputs[0] # # notice: optionally add the KL loss, or the WGAN Wasserstein term # pa = 0.0001 # for i in range(args.train_batch_size): # loss += ((pa)* # ((cal_diff(x=outputs.hidden_states[0], y=outputs.hidden_states[-1], norm="line",criterion="kl")+ # cal_diff(x=outputs.hidden_states[-1], y=outputs.hidden_states[0], norm="line", criterion="kl") # )/2) # ) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with
amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 # Log metrics if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Only evaluate when single GPU otherwise metrics may not average well if args.local_rank == -1 and args.evaluate_during_training: results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss # Save model checkpoint if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) # Take care of distributed/parallel training model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break current_stage += 1 if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
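Stripped of the project-specific pieces, the curriculum logic above amounts to: build one DataLoader per stage (the full dataset repeated for the warm-up epochs, then one loader per difficulty bucket) and advance current_stage after each pass. Below is a minimal sketch with a toy dataset, using hand-made buckets in place of Difficulty_Evaluation (an assumption for illustration).

import torch
from torch.utils.data import DataLoader, RandomSampler, Subset, TensorDataset

# Toy dataset standing in for the tokenized QA features used above.
full = TensorDataset(torch.arange(100).float().unsqueeze(1), torch.zeros(100).long())

# Hand-made "difficulty" buckets (easiest first); the code above derives these from Difficulty_Evaluation.
buckets = [Subset(full, range(0, 40)), Subset(full, range(40, 80)), Subset(full, range(80, 100))]

batch_size = 8
num_warmup_epochs = 2
stages = [DataLoader(full, sampler=RandomSampler(full), batch_size=batch_size)
          for _ in range(num_warmup_epochs)]                                     # full-data passes first
stages += [DataLoader(b, batch_size=batch_size, shuffle=True) for b in buckets]  # then one stage per bucket

for current_stage, loader in enumerate(stages):
    n_batches = 0
    for batch in loader:  # forward / backward / optimizer.step() would go here
        n_batches += 1
    print(f"stage {current_stage}: {n_batches} batches")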
def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if os.path.isfile(os.path.join( args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt")): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): # set global_step to gobal_step of last saved checkpoint from model path try: global_step = int( args.model_name_or_path.split("-")[-1].split("/")[0]) except ValueError: global_step = 0 epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed(args) # Added here for reproductibility for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3] } if args.model_type != "distilbert": inputs["token_type_ids"] = ( batch[2] if args.model_type in ["bert", "xlnet"] else None ) # XLM and RoBERTa don"t use segment_ids outputs = model(**inputs) loss = outputs[ 0] # model outputs are always tuple in pytorch-transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if ( args.local_rank == -1 and args.evaluate_during_training ): # Only evaluate when single GPU otherwise metrics may not average well results, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev") for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss #ADD: # use "--eval_every_epoch" insted of "--evaluate_during_training". 
if args.local_rank in [ -1, 0 ] and args.logging_steps == 0 and args.eval_every_epoch: if global_step % (t_total / args.num_train_epochs) == 0: results, _, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev") results['gs'] = global_step results['epochs'] = int(global_step / t_total * args.num_train_epochs) output_dev_file = os.path.join( args.output_dir, args.result_prefix + "dev_results.txt") if not os.path.exists( args.output_dir) and args.local_rank in [ -1, 0 ]: os.makedirs(args.output_dir) if not os.path.exists(output_dev_file): with open(output_dev_file, 'w') as writer: writer.write( 'Global_step,Epochs,Loss,TP,FP,FN,Prec,Rec,FB1\n' ) with open(output_dev_file, 'a') as writer: writer.write( '{gs},{epochs},{loss},{TP},{FP},{FN},{precision},{recall},{FB1}\n' .format(**results)) if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir = os.path.join( args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
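The per-epoch dev logging above writes a CSV-style header once and then appends one row per evaluation. Here is a minimal sketch of that bookkeeping with hypothetical paths and metric values; evaluate itself is assumed to return the TP/FP/FN/precision/recall/FB1 fields used in the format string.

import os

def append_dev_result(output_dir, result, prefix=""):
    """Append one row of dev metrics, writing the header only on first use."""
    os.makedirs(output_dir, exist_ok=True)
    path = os.path.join(output_dir, prefix + "dev_results.txt")
    if not os.path.exists(path):
        with open(path, "w") as writer:
            writer.write("Global_step,Epochs,Loss,TP,FP,FN,Prec,Rec,FB1\n")
    with open(path, "a") as writer:
        writer.write("{gs},{epochs},{loss},{TP},{FP},{FN},{precision},{recall},{FB1}\n".format(**result))

# Hypothetical values, just to show the call shape.
append_dev_result("/tmp/ner_run", {"gs": 500, "epochs": 1, "loss": 0.31, "TP": 120, "FP": 15,
                                   "FN": 22, "precision": 88.9, "recall": 84.5, "FB1": 86.6})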
def main(_A: argparse.Namespace): if _A.num_gpus_per_machine == 0: # Set device as CPU if num_gpus_per_machine = 0. device = torch.device("cpu") else: # Get the current device as set for current distributed process. # Check `launch` function in `virtex.utils.distributed` module. device = torch.cuda.current_device() # Create a config object (this will be immutable) and perform common setup # such as logging and setting up serialization directory. _C = Config(_A.config, _A.config_override) common_setup(_C, _A) # ------------------------------------------------------------------------- # INSTANTIATE DATALOADER, MODEL, OPTIMIZER # ------------------------------------------------------------------------- tokenizer = TokenizerFactory.from_config(_C) train_dataset = PretrainingDatasetFactory.from_config(_C, split="train") val_dataset = PretrainingDatasetFactory.from_config(_C, split="val") # Make `DistributedSampler`s to shard datasets across GPU processes. # Skip this if training on CPUs. train_sampler = ( DistributedSampler(train_dataset, shuffle=True) # type: ignore if _A.num_gpus_per_machine > 0 else None) val_sampler = ( DistributedSampler(val_dataset, shuffle=False) # type: ignore if _A.num_gpus_per_machine > 0 else None) train_dataloader = DataLoader( train_dataset, batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(), sampler=train_sampler, shuffle=train_sampler is None, num_workers=_A.cpu_workers, pin_memory=True, drop_last=True, collate_fn=train_dataset.collate_fn, ) val_dataloader = DataLoader( val_dataset, batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(), sampler=val_sampler, shuffle=False, num_workers=_A.cpu_workers, pin_memory=True, drop_last=False, collate_fn=val_dataset.collate_fn, ) model = PretrainingModelFactory.from_config(_C).to(device) optimizer = OptimizerFactory.from_config(_C, model.named_parameters()) scheduler = LRSchedulerFactory.from_config(_C, optimizer) # ------------------------------------------------------------------------- # BEFORE TRAINING STARTS # ------------------------------------------------------------------------- # Load checkpoint to resume training if specified. if _A.resume_from is not None: start_iteration = CheckpointManager(model=model, optimizer=optimizer, scheduler=scheduler).load( _A.resume_from) else: start_iteration = 0 # Keep track of time per iteration and ETA. timer = Timer( start_from=start_iteration + 1, total_iterations=_C.OPTIM.NUM_ITERATIONS, ) # Create an iterator from dataloader to sample batches perpetually. train_dataloader_iter = cycle(train_dataloader, device, start_iteration) # Wrap model and optimizer using NVIDIA Apex for mixed precision training. # NOTE: Always do this before wrapping model with DistributedDataParallel. if _C.FP16_OPT > 0: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level=f"O{_C.FP16_OPT}") # Wrap model in DDP if using more than one processes. if dist.get_world_size() > 1: dist.synchronize() model = nn.parallel.DistributedDataParallel( model, device_ids=[device], find_unused_parameters=True) # Create checkpoint manager and tensorboard writer (only in master process). 
if dist.is_master_process(): checkpoint_manager = CheckpointManager( _A.serialization_dir, model=model, optimizer=optimizer, scheduler=scheduler, ) tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir) tensorboard_writer.add_text("config", f"```\n{_C}\n```") # ------------------------------------------------------------------------- # TRAINING LOOP # ------------------------------------------------------------------------- for iteration in range(start_iteration + 1, _C.OPTIM.NUM_ITERATIONS + 1): timer.tic() optimizer.zero_grad() batch_loss = torch.tensor(0.0, device=device) batch = next(train_dataloader_iter) output_dict = model(batch) loss = output_dict["loss"] batch_loss += loss.item() # Perform dynamic scaling of loss to adjust for mixed precision. if _C.FP16_OPT > 0: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # Clip norm of gradients before optimizer step. torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer) if _C.FP16_OPT > 0 else model.parameters(), _C.OPTIM.CLIP_GRAD_NORM, ) optimizer.step() scheduler.step(iteration) timer.toc() # --------------------------------------------------------------------- # TENSORBOARD LOGGING # --------------------------------------------------------------------- if iteration % _A.log_every == 0 and dist.is_master_process(): logger.info(f"{timer.stats} | Loss: {batch_loss:.3f} | " f"GPU mem: {dist.gpu_mem_usage()} MB") tensorboard_writer.add_scalars( "learning_rate", { "visual": optimizer.param_groups[0]["lr"], "common": optimizer.param_groups[-1]["lr"], }, iteration, ) tensorboard_writer.add_scalars("train", output_dict["loss_components"], iteration) # --------------------------------------------------------------------- # VALIDATION # --------------------------------------------------------------------- if iteration % _A.checkpoint_every == 0: if dist.is_master_process(): checkpoint_manager.step(iteration) torch.set_grad_enabled(False) model.eval() # Accumulate different val loss components according to the type of # pretraining model. val_loss_counter: Counter = Counter() for val_iteration, val_batch in enumerate(val_dataloader, start=1): for key in val_batch: val_batch[key] = val_batch[key].to(device) output_dict = model(val_batch) val_loss_counter.update(output_dict["loss_components"]) # Divide each loss component by number of val batches per GPU. val_loss_dict = { k: v / val_iteration for k, v in dict(val_loss_counter).items() } dist.average_across_processes(val_loss_dict) torch.set_grad_enabled(True) model.train() if iteration % _A.checkpoint_every == 0 and dist.is_master_process(): logger.info(f"Iter: {iteration} | Val loss: {val_loss_dict}") tensorboard_writer.add_scalars("val", val_loss_dict, iteration) # All processes will wait till master process is done logging. dist.synchronize()
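The validation pass above accumulates per-component losses in a collections.Counter and divides by the number of val batches before averaging across processes. A self-contained sketch of just that accumulation step (component names are made up):

from collections import Counter

def average_loss_components(per_batch_components):
    """Average each loss component over a list of {name: value} dicts, as the val loop above does."""
    counter = Counter()
    n_batches = 0
    for components in per_batch_components:
        counter.update(components)  # Counter sums float values per key
        n_batches += 1
    return {k: v / n_batches for k, v in counter.items()}

print(average_loss_components([{"captioning": 2.0, "contrastive": 1.0},
                               {"captioning": 1.0, "contrastive": 3.0}]))
# -> {'captioning': 1.5, 'contrastive': 2.0}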
def train(hyp): cfg = opt.cfg data = opt.data epochs = opt.epochs # 500200 batches at bs 64, 117263 images = 273 epochs batch_size = opt.batch_size accumulate = max(round(64 / batch_size), 1) # accumulate n times before optimizer update (bs 64) weights = opt.weights # initial training weights imgsz_min, imgsz_max, imgsz_test = opt.img_size # img sizes (min, max, test) # Image Sizes gs = 32 # (pixels) grid size assert math.fmod(imgsz_min, gs) == 0, '--img-size %g must be a %g-multiple' % (imgsz_min, gs) opt.multi_scale |= imgsz_min != imgsz_max # multi if different (min, max) if opt.multi_scale: if imgsz_min == imgsz_max: imgsz_min //= 1.5 imgsz_max //= 0.667 grid_min, grid_max = imgsz_min // gs, imgsz_max // gs imgsz_min, imgsz_max = int(grid_min * gs), int(grid_max * gs) img_size = imgsz_max # initialize with max size # Configure run init_seeds() data_dict = parse_data_cfg(data) train_path = data_dict['train'] test_path = data_dict['valid'] nc = 1 if opt.single_cls else int(data_dict['classes']) # number of classes hyp['cls'] *= nc / 80 # update coco-tuned hyp['cls'] to current dataset # Remove previous results for f in glob.glob('*_batch*.jpg') + glob.glob(results_file): os.remove(f) # Initialize model model = Darknet(cfg).to(device) # Optimizer pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in dict(model.named_parameters()).items(): if '.bias' in k: pg2 += [v] # biases elif 'Conv2d.weight' in k: pg1 += [v] # apply weight_decay else: pg0 += [v] # all else if opt.adam: # hyp['lr0'] *= 0.1 # reduce lr (i.e. SGD=5E-3, Adam=5E-4) optimizer = optim.Adam(pg0, lr=hyp['lr0']) # optimizer = AdaBound(pg0, lr=hyp['lr0'], final_lr=0.1) else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']}) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) print('Optimizer groups: %g .bias, %g Conv2d.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 start_epoch = 0 best_fitness = 0.0 attempt_download(weights) if weights.endswith('.pt'): # pytorch format # possible weights are '*.pt', 'yolov3-spp.pt', 'yolov3-tiny.pt' etc. ckpt = torch.load(weights, map_location=device) # load model try: ckpt['model'] = {k: v for k, v in ckpt['model'].items() if model.state_dict()[k].numel() == v.numel()} model.load_state_dict(ckpt['model'], strict=False) except KeyError as e: s = "%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s. " \ "See https://github.com/ultralytics/yolov3/issues/657" % (opt.weights, opt.cfg, opt.weights) raise KeyError(s) from e # load optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # load results if ckpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(ckpt['training_results']) # write results.txt # epochs start_epoch = ckpt['epoch'] + 1 if epochs < start_epoch: print('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' % (opt.weights, ckpt['epoch'], epochs)) epochs += ckpt['epoch'] # finetune additional epochs del ckpt elif len(weights) > 0: # darknet format # possible weights are '*.weights', 'yolov3-tiny.conv.15', 'darknet53.conv.74' etc. 
load_darknet_weights(model, weights) if opt.freeze_layers: output_layer_indices = [idx - 1 for idx, module in enumerate(model.module_list) if isinstance(module, YOLOLayer)] freeze_layer_indices = [x for x in range(len(model.module_list)) if (x not in output_layer_indices) and (x - 1 not in output_layer_indices)] for idx in freeze_layer_indices: for parameter in model.module_list[idx].parameters(): parameter.requires_grad_(False) # Mixed precision training https://github.com/NVIDIA/apex if mixed_precision: model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0) # Scheduler https://arxiv.org/pdf/1812.01187.pdf lf = lambda x: (((1 + math.cos(x * math.pi / epochs)) / 2) ** 1.0) * 0.95 + 0.05 # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) scheduler.last_epoch = start_epoch - 1 # see link below # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822 # Plot lr schedule # y = [] # for _ in range(epochs): # scheduler.step() # y.append(optimizer.param_groups[0]['lr']) # plt.plot(y, '.-', label='LambdaLR') # plt.xlabel('epoch') # plt.ylabel('LR') # plt.tight_layout() # plt.savefig('LR.png', dpi=300) # Initialize distributed training if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available(): dist.init_process_group(backend='nccl', # 'distributed backend' init_method='tcp://127.0.0.1:9999', # distributed training init method world_size=1, # number of nodes for distributed training rank=0) # distributed training node rank model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True) model.yolo_layers = model.module.yolo_layers # move yolo layer indices to top level # Dataset dataset = LoadImagesAndLabels(train_path, img_size, batch_size, augment=True, hyp=hyp, # augmentation hyperparameters rect=opt.rect, # rectangular training cache_images=opt.cache_images, single_cls=opt.single_cls) # Dataloader batch_size = min(batch_size, len(dataset)) nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, num_workers=nw, shuffle=not opt.rect, # Shuffle=True unless rectangular training is used pin_memory=True, collate_fn=dataset.collate_fn) # Testloader testloader = torch.utils.data.DataLoader(LoadImagesAndLabels(test_path, imgsz_test, batch_size, hyp=hyp, rect=True, cache_images=opt.cache_images, single_cls=opt.single_cls), batch_size=batch_size, num_workers=nw, pin_memory=True, collate_fn=dataset.collate_fn) # Model parameters model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights # Model EMA ema = torch_utils.ModelEMA(model) # Start training nb = len(dataloader) # number of batches n_burn = max(3 * nb, 500) # burn-in iterations, max(3 epochs, 500 iterations) maps = np.zeros(nc) # mAP per class # torch.autograd.set_detect_anomaly(True) results = (0, 0, 0, 0, 0, 0, 0) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' t0 = time.time() print('Image sizes %g - %g train, %g test' % (imgsz_min, imgsz_max, imgsz_test)) print('Using %g dataloader workers' % nw) print('Starting training for %g epochs...' 
% epochs) for epoch in range(start_epoch, epochs): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if dataset.image_weights: w = model.class_weights.cpu().numpy() * (1 - maps) ** 2 # class weights image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n) # rand weighted idx mloss = torch.zeros(4).to(device) # mean losses print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) pbar = tqdm(enumerate(dataloader), total=nb) # progress bar for i, (imgs, targets, paths, _) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device).float() / 255.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0 targets = targets.to(device) # Burn-in if ni <= n_burn: xi = [0, n_burn] # x interp model.gr = np.interp(ni, xi, [0.0, 1.0]) # giou loss ratio (obj_loss = 1.0 or giou) accumulate = max(1, np.interp(ni, xi, [1, 64 / batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp(ni, xi, [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) x['weight_decay'] = np.interp(ni, xi, [0.0, hyp['weight_decay'] if j == 1 else 0.0]) if 'momentum' in x: x['momentum'] = np.interp(ni, xi, [0.9, hyp['momentum']]) # Multi-Scale if opt.multi_scale: if ni / accumulate % 1 == 0: # adjust img_size (67% - 150%) every 1 batch img_size = random.randrange(grid_min, grid_max + 1) * gs sf = img_size / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]] # new shape (stretched to 32-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward pred = model(imgs) # Loss loss, loss_items = compute_loss(pred, targets, model) if not torch.isfinite(loss): print('WARNING: non-finite loss, ending training ', loss_items) return results # Backward loss *= batch_size / 64 # scale loss if mixed_precision: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # Optimize if ni % accumulate == 0: optimizer.step() optimizer.zero_grad() ema.update(model) # Print mloss = (mloss * i + loss_items) / (i + 1) # update mean losses mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.3g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, len(targets), img_size) pbar.set_description(s) # Plot if ni < 1: f = 'train_batch%g.jpg' % i # filename res = plot_images(images=imgs, targets=targets, paths=paths, fname=f) if tb_writer: tb_writer.add_image(f, res, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add model to tensorboard # end batch ------------------------------------------------------------------------------------------------ # Update scheduler scheduler.step() # Process epoch results ema.update_attr(model) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP is_coco = any([x in data for x in ['coco.data', 'coco2014.data', 'coco2017.data']]) and model.nc == 80 results, maps = test.test(cfg, data, batch_size=batch_size, imgsz=imgsz_test, model=ema.ema, save_json=final_epoch and is_coco, single_cls=opt.single_cls, dataloader=testloader, multi_label=ni > n_burn) # Write with 
open(results_file, 'a') as f: f.write(s + '%10.3g' * 7 % results + '\n') # P, R, mAP, F1, test_losses=(GIoU, obj, cls) if len(opt.name) and opt.bucket: os.system('gsutil cp results.txt gs://%s/results/results%s.txt' % (opt.bucket, opt.name)) # Tensorboard if tb_writer: tags = ['train/giou_loss', 'train/obj_loss', 'train/cls_loss', 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/F1', 'val/giou_loss', 'val/obj_loss', 'val/cls_loss'] for x, tag in zip(list(mloss[:-1]) + list(results), tags): tb_writer.add_scalar(tag, x, epoch) # Update best mAP fi = fitness(np.array(results).reshape(1, -1)) # fitness_i = weighted combination of [P, R, mAP, F1] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # create checkpoint ckpt = {'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ema.ema.module.state_dict() if hasattr(model, 'module') else ema.ema.state_dict(), 'optimizer': None if final_epoch else optimizer.state_dict()} # Save last, best and delete torch.save(ckpt, last) if (best_fitness == fi) and not final_epoch: torch.save(ckpt, best) del ckpt # end epoch ---------------------------------------------------------------------------------------------------- # end training n = opt.name if len(n): n = '_' + n if not n.isnumeric() else n fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'], [flast, fbest, fresults]): if os.path.exists(f1): os.rename(f1, f2) # rename ispt = f2.endswith('.pt') # is *.pt strip_optimizer(f2) if ispt else None # strip optimizer os.system('gsutil cp %s gs://%s/weights' % (f2, opt.bucket)) if opt.bucket and ispt else None # upload if not opt.evolve: plot_results() # save as results.png print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group() if torch.cuda.device_count() > 1 else None torch.cuda.empty_cache() return results
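The burn-in block above uses np.interp to ramp the bias-group learning rate down from 0.1 while the other groups rise from 0.0 over n_burn integrated batches. A small sketch of that interpolation in isolation, with illustrative values:

import numpy as np

def burn_in_lr(ni, n_burn, target_lr, is_bias_group):
    """Warmup LR for one param group: biases fall from 0.1, everything else rises from 0.0."""
    start = 0.1 if is_bias_group else 0.0
    return float(np.interp(ni, [0, n_burn], [start, target_lr]))

for ni in (0, 250, 500):
    print(ni,
          round(burn_in_lr(ni, 500, target_lr=0.01, is_bias_group=False), 4),
          round(burn_in_lr(ni, 500, target_lr=0.01, is_bias_group=True), 4))
# ni=0 -> 0.0 / 0.1, ni=500 -> 0.01 / 0.01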
def _batch_process(self, x_batch, y_batch, lr): """ Perform the operations of FBF for a batch of data. See class documentation for more information on the exact procedure. :param x_batch: batch of x. :type x_batch: `np.ndarray` :param y_batch: batch of y. :type y_batch: `np.ndarray` :param lr: learning rate for the optimisation step. :type lr: `float` :return: `(float, float, float)` """ import torch n = x_batch.shape[0] m = np.prod(x_batch.shape[1:]) delta = random_sphere(n, m, self._eps, np.inf).reshape( x_batch.shape).astype(ART_NUMPY_DTYPE) delta_grad = self._classifier.loss_gradient(x_batch + delta, y_batch) delta = np.clip(delta + 1.25 * self._eps * np.sign(delta_grad), -self._eps, +self._eps) x_batch_pert = np.clip(x_batch + delta, self._classifier.clip_values[0], self._classifier.clip_values[1]) # Apply preprocessing x_preprocessed, y_preprocessed = self._classifier._apply_preprocessing( x_batch_pert, y_batch, fit=True) # Check label shape if self._classifier._reduce_labels: y_preprocessed = np.argmax(y_preprocessed, axis=1) i_batch = torch.from_numpy(x_preprocessed).to(self._classifier._device) o_batch = torch.from_numpy(y_preprocessed).to(self._classifier._device) # Zero the parameter gradients self._classifier._optimizer.zero_grad() # Perform prediction model_outputs = self._classifier._model(i_batch) # Form the loss function loss = self._classifier._loss(model_outputs[-1], o_batch) self._classifier._optimizer.param_groups[0].update(lr=lr) # Actual training if self._use_amp: import apex.amp as amp with amp.scale_loss(loss, self._classifier._optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # clip the gradients torch.nn.utils.clip_grad_norm_(self._classifier._model.parameters(), 0.5) self._classifier._optimizer.step() train_loss = loss.item() * o_batch.size(0) train_acc = (model_outputs[0].max(1)[1] == o_batch).sum().item() train_n = o_batch.size(0) return train_loss, train_acc, train_n
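The numpy core of the FBF step above is: start from a random perturbation, take one signed-gradient step of size 1.25*eps, project back to the eps-ball, then clip the perturbed input to the valid pixel range. A minimal sketch with random stand-ins for random_sphere and the classifier's loss_gradient (both stand-ins are assumptions):

import numpy as np

def fbf_perturb(x, grad, delta, eps, clip_min=0.0, clip_max=1.0):
    """One FBF-style update: signed-gradient step, project to the eps-ball, clip to valid pixels."""
    delta = np.clip(delta + 1.25 * eps * np.sign(grad), -eps, eps)
    x_pert = np.clip(x + delta, clip_min, clip_max)
    return x_pert, delta

rng = np.random.default_rng(0)
eps = 8.0 / 255.0
x = rng.random((2, 3, 8, 8)).astype(np.float32)             # images in [0, 1)
delta = rng.uniform(-eps, eps, x.shape).astype(np.float32)  # stand-in for the random_sphere init
grad = rng.standard_normal(x.shape).astype(np.float32)      # stand-in for loss_gradient(x + delta, y)
x_pert, delta = fbf_perturb(x, grad, delta, eps)
print(float(np.abs(x_pert - x).max()) <= eps + 1e-6)        # perturbation stays inside the eps-ball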
def main(): parser = argparse.ArgumentParser() parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") args = parser.parse_args() torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend="nccl") args.device = device seed = 2001 random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed(seed) torch.backends.cudnn.deterministic = True # prepare input import pickle with open('../process_input/splitall/image_list_train.pickle', 'rb') as f: image_list_train = pickle.load(f) with open('../process_input/splitall/image_dict.pickle', 'rb') as f: image_dict = pickle.load(f) with open('../lung_localization/splitall/bbox_dict_train.pickle', 'rb') as f: bbox_dict_train = pickle.load(f) print(len(image_list_train), len(image_dict), len(bbox_dict_train)) # hyperparameters learning_rate = 0.0004 batch_size = 32 image_size = 576 num_epoch = 1 # build model if args.local_rank != 0: torch.distributed.barrier() model = seresnext50() if args.local_rank == 0: torch.distributed.barrier() model.to(args.device) num_train_steps = int(len(image_list_train)/(batch_size*4)*num_epoch) # 4 GPUs optimizer = optim.Adam(model.parameters(), lr=learning_rate) scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps) model, optimizer = amp.initialize(model, optimizer, opt_level="O1",verbosity=0) model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) criterion = nn.BCEWithLogitsLoss().to(args.device) # training train_transform = albumentations.Compose([ albumentations.RandomContrast(limit=0.2, p=1.0), albumentations.ShiftScaleRotate(shift_limit=0.2, scale_limit=0.2, rotate_limit=20, border_mode=cv2.BORDER_CONSTANT, p=1.0), albumentations.Cutout(num_holes=2, max_h_size=int(0.4*image_size), max_w_size=int(0.4*image_size), fill_value=0, always_apply=True, p=1.0), albumentations.Normalize(mean=(0.456, 0.456, 0.456), std=(0.224, 0.224, 0.224), max_pixel_value=255.0, p=1.0) ]) # iterator for training datagen = PEDataset(image_dict=image_dict, bbox_dict=bbox_dict_train, image_list=image_list_train, target_size=image_size, transform=train_transform) sampler = DistributedSampler(datagen) generator = DataLoader(dataset=datagen, sampler=sampler, batch_size=batch_size, num_workers=5, pin_memory=True) for ep in range(num_epoch): losses = AverageMeter() model.train() for j,(images,labels) in enumerate(generator): images = images.to(args.device) labels = labels.float().to(args.device) logits = model(images) loss = criterion(logits.view(-1),labels) losses.update(loss.item(), images.size(0)) optimizer.zero_grad() with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() optimizer.step() scheduler.step() if args.local_rank == 0: print('epoch: {}, train_loss: {}'.format(ep,losses.avg), flush=True) if args.local_rank == 0: out_dir = 'weights/' if not os.path.exists(out_dir): os.makedirs(out_dir) torch.save(model.module.state_dict(), out_dir+'epoch{}'.format(ep))
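AverageMeter in the loop above is assumed to be the usual running-average helper, weighted by the number of items in each batch. A minimal stand-in:

class AverageMeter:
    """Running average weighted by the number of items, as used for the training loss above."""
    def __init__(self):
        self.sum, self.count = 0.0, 0

    def update(self, val, n=1):
        self.sum += val * n
        self.count += n

    @property
    def avg(self):
        return self.sum / max(self.count, 1)

losses = AverageMeter()
for loss_value, batch_size in [(0.9, 32), (0.7, 32), (0.5, 16)]:
    losses.update(loss_value, batch_size)
print(round(losses.avg, 4))  # 0.74: the 16-sample batch counts for half as much as the 32-sample ones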
def train(output_directory, epochs, learning_rate, sigma, iters_per_checkpoint, batch_size, seed, fp16_run, checkpoint_path, with_tensorboard): torch.manual_seed(seed) torch.cuda.manual_seed(seed) criterion = WaveGlowLoss(sigma) model = WaveGlow(**waveglow_config).cuda() optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) if fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') # Load checkpoint if one exists iteration = 0 if checkpoint_path != "": model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer) iteration += 1 # next iteration is iteration + 1 trainset = Mel2Samp(**data_config) train_loader = DataLoader(trainset, num_workers=1, shuffle=False, batch_size=batch_size, pin_memory=False, drop_last=True) # Get shared output_directory ready if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) if with_tensorboard: from tensorboardX import SummaryWriter logger = SummaryWriter(os.path.join(output_directory, 'logs')) model.train() epoch_offset = max(0, int(iteration / len(train_loader))) # ================ MAIN TRAINING LOOP! =================== for epoch in range(epoch_offset, epochs): print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): model.zero_grad() mel, audio = batch mel = torch.autograd.Variable(mel.cuda()) audio = torch.autograd.Variable(audio.cuda()) outputs = model((mel, audio)) loss = criterion(outputs) reduced_loss = loss.item() if fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() optimizer.step() print("{}:\t{:.9f}".format(iteration, reduced_loss)) if with_tensorboard: logger.add_scalar('training_loss', reduced_loss, i + len(train_loader) * epoch) if (iteration % iters_per_checkpoint == 0): checkpoint_path = "{}/waveglow_{}".format( output_directory, iteration) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
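load_checkpoint and save_checkpoint above are assumed helpers. Below is a minimal, compatible-in-spirit sketch (not the repository's exact implementation) that stores the model, optimizer, and iteration counter so training can resume at iteration + 1 as the loop above does.

import torch
import torch.nn as nn

def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    torch.save({"model": model.state_dict(), "optimizer": optimizer.state_dict(),
                "learning_rate": learning_rate, "iteration": iteration}, filepath)

def load_checkpoint(filepath, model, optimizer):
    ckpt = torch.load(filepath, map_location="cpu")
    model.load_state_dict(ckpt["model"])
    optimizer.load_state_dict(ckpt["optimizer"])
    return model, optimizer, ckpt["iteration"]

# Round-trip on a toy model.
model = nn.Linear(4, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
save_checkpoint(model, optimizer, 1e-4, iteration=100, filepath="/tmp/toy_ckpt.pt")
model, optimizer, iteration = load_checkpoint("/tmp/toy_ckpt.pt", model, optimizer)
print(iteration)  # 100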
def train(hyp, tb_writer, opt, device): print(f'Hyperparameters {hyp}') log_dir = tb_writer.log_dir if tb_writer else 'runs/evolution' # run directory wdir = str(Path(log_dir) / 'weights') + os.sep # weights directory os.makedirs(wdir, exist_ok=True) last = wdir + 'last.pt' best = wdir + 'best.pt' results_file = log_dir + os.sep + 'results.txt' epochs, batch_size, total_batch_size, weights, rank = \ opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.local_rank # TODO: Init DDP logging. Only the first process is allowed to log. # Since I see lots of print here, the logging configuration is skipped here. We may see repeated outputs. # Save run settings with open(Path(log_dir) / 'hyp.yaml', 'w') as f: yaml.dump(hyp, f, sort_keys=False) with open(Path(log_dir) / 'opt.yaml', 'w') as f: yaml.dump(vars(opt), f, sort_keys=False) # Configure init_seeds(2 + rank) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.FullLoader) # model dict train_path = data_dict['train'] test_path = data_dict['val'] nc, names = (1, ['item']) if opt.single_cls else (int( data_dict['nc']), data_dict['names']) # number classes, names assert len(names) == nc, '%g names found for nc=%g dataset in %s' % ( len(names), nc, opt.data) # check # Remove previous results if rank in [-1, 0]: for f in glob.glob('*_batch*.jpg') + glob.glob(results_file): os.remove(f) # Create model model = Model(opt.cfg, nc=nc).to(device) # Image sizes gs = int(max(model.stride)) # grid size (max stride) imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size ] # verify imgsz are gs-multiples # Optimizer nbs = 64 # nominal batch size # default DDP implementation is slow for accumulation according to: https://pytorch.org/docs/stable/notes/ddp.html # all-reduce operation is carried out during loss.backward(). # Thus, there would be redundant all-reduce communications in a accumulation procedure, # which means, the result is still right but the training speed gets slower. 
# TODO: If acceleration is needed, there is an implementation of allreduce_post_accumulation # in https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/run_pretraining.py accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_parameters(): if v.requires_grad: if '.bias' in k: pg2.append(v) # biases elif '.weight' in k and '.bn' not in k: pg1.append(v) # apply weight decay else: pg0.append(v) # all else if hyp['optimizer'] == 'adam': # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) print('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Load Model with torch_distributed_zero_first(rank): google_utils.attempt_download(weights) start_epoch, best_fitness = 0, 0.0 if weights.endswith('.pt'): # pytorch format ckpt = torch.load(weights, map_location=device) # load checkpoint # load model try: exclude = ['anchor'] # exclude keys ckpt['model'] = { k: v for k, v in ckpt['model'].float().state_dict().items() if k in model.state_dict() and not any(x in k for x in exclude) and model.state_dict()[k].shape == v.shape } model.load_state_dict(ckpt['model'], strict=False) print('Transferred %g/%g items from %s' % (len(ckpt['model']), len(model.state_dict()), weights)) except KeyError as e: s = "%s is not compatible with %s. This may be due to model differences or %s may be out of date. " \ "Please delete or update %s and try again, or use --weights '' to train from scratch." \ % (weights, opt.cfg, weights, weights) raise KeyError(s) from e # load optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # load results if ckpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(ckpt['training_results']) # write results.txt # epochs start_epoch = ckpt['epoch'] + 1 if epochs < start_epoch: print( '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' 
% (weights, ckpt['epoch'], epochs)) epochs += ckpt['epoch'] # finetune additional epochs del ckpt # Mixed precision training https://github.com/NVIDIA/apex if mixed_precision: model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0) # Scheduler https://arxiv.org/pdf/1812.01187.pdf lf = lambda x: (( (1 + math.cos(x * math.pi / epochs)) / 2)**1.0) * 0.8 + 0.2 # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822 # plot_lr_scheduler(optimizer, scheduler, epochs) # DP mode if device.type != 'cpu' and rank == -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and device.type != 'cpu' and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) print('Using SyncBatchNorm()') # Exponential moving average ema = torch_utils.ModelEMA(model) if rank in [-1, 0] else None # DDP mode if device.type != 'cpu' and rank != -1: model = DDP(model, device_ids=[rank], output_device=rank) # Trainloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, local_rank=rank, world_size=opt.world_size) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % ( mlc, nc, opt.data, nc - 1) # Testloader if rank in [-1, 0]: # local_rank is set to -1. Because only the first process is expected to do evaluation. testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt, hyp=hyp, augment=False, cache=opt.cache_images, rect=True, local_rank=-1, world_size=opt.world_size)[0] # Model parameters hyp['cls'] *= nc / 80. # scale coco-tuned hyp['cls'] to current dataset model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to( device) # attach class weights model.names = names # Class frequency if rank in [-1, 0]: labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. # model._initialize_biases(cf.to(device)) plot_labels(labels, save_dir=log_dir) if tb_writer: # tb_writer.add_hparams(hyp, {}) # causes duplicate https://github.com/ultralytics/yolov5/pull/384 tb_writer.add_histogram('classes', c, 0) # Check anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # Start training t0 = time.time() nw = max(3 * nb, 1e3) # number of warmup iterations, max(3 epochs, 1k iterations) maps = np.zeros(nc) # mAP per class results = ( 0, 0, 0, 0, 0, 0, 0 ) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' scheduler.last_epoch = start_epoch - 1 # do not move if rank in [0, -1]: print('Image sizes %g train, %g test' % (imgsz, imgsz_test)) print('Using %g dataloader workers' % dataloader.num_workers) print('Starting training for %g epochs...' % epochs) # torch.autograd.set_detect_anomaly(True) for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) # When in DDP mode, the generated indices will be broadcasted to synchronize dataset. if dataset.image_weights: # Generate indices. 
if rank in [-1, 0]: w = model.class_weights.cpu().numpy() * ( 1 - maps)**2 # class weights image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) dataset.indices = random.choices( range(dataset.n), weights=image_weights, k=dataset.n) # rand weighted idx # Broadcast. if rank != -1: indices = torch.zeros([dataset.n], dtype=torch.int) if rank == 0: indices[:] = torch.from_tensor(dataset.indices, dtype=torch.int) dist.broadcast(indices, 0) if rank != 0: dataset.indices = indices.cpu().numpy() # Update mosaic border # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(4, device=device) # mean losses if rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) if rank in [-1, 0]: print( ('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float( ) / 255.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # giou loss ratio (obj_loss = 1.0 or giou) accumulate = max( 1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp( ni, xi, [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) if 'momentum' in x: x['momentum'] = np.interp(ni, xi, [0.9, hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward pred = model(imgs) # Loss loss, loss_items = compute_loss(pred, targets.to(device), model) # scaled by batch_size if rank != -1: loss *= opt.world_size # gradient averaged between devices in DDP mode if not torch.isfinite(loss): print('WARNING: non-finite loss, ending training ', loss_items) return results # Backward if mixed_precision: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # Optimize if ni % accumulate == 0: optimizer.step() optimizer.zero_grad() if ema is not None: ema.update(model) # Print if rank in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1 ) # update mean losses mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.4g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) pbar.set_description(s) # Plot if ni < 3: f = str(Path(log_dir) / ('train_batch%g.jpg' % ni)) # filename result = plot_images(images=imgs, targets=targets, paths=paths, fname=f) if tb_writer and result is not None: tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add model to tensorboard # end batch ------------------------------------------------------------------------------------------------ # Scheduler scheduler.step() # Only the first process in DDP mode is allowed to log or save checkpoints. 
if rank in [-1, 0]: # mAP if ema is not None: ema.update_attr( model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride']) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP results, maps, times = test.test( opt.data, batch_size=total_batch_size, imgsz=imgsz_test, save_json=final_epoch and opt.data.endswith(os.sep + 'coco.yaml'), model=ema.ema.module if hasattr(ema.ema, 'module') else ema.ema, single_cls=opt.single_cls, dataloader=testloader, save_dir=log_dir) # Write with open(results_file, 'a') as f: f.write(s + '%10.4g' * 7 % results + '\n') # P, R, mAP, F1, test_losses=(GIoU, obj, cls) if len(opt.name) and opt.bucket: os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name)) # Tensorboard if tb_writer: tags = [ 'train/giou_loss', 'train/obj_loss', 'train/cls_loss', 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', 'val/giou_loss', 'val/obj_loss', 'val/cls_loss' ] for x, tag in zip(list(mloss[:-1]) + list(results), tags): tb_writer.add_scalar(tag, x, epoch) # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # fitness_i = weighted combination of [P, R, mAP, F1] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # create checkpoint ckpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ema.ema.module if hasattr(ema, 'module') else ema.ema, 'optimizer': None if final_epoch else optimizer.state_dict() } # Save last, best and delete torch.save(ckpt, last) if (best_fitness == fi) and not final_epoch: torch.save(ckpt, best) del ckpt # end epoch ---------------------------------------------------------------------------------------------------- # end training if rank in [-1, 0]: # Strip optimizers n = ('_' if len(opt.name) and not opt.name.isnumeric() else '') + opt.name fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'], [flast, fbest, fresults]): if os.path.exists(f1): os.rename(f1, f2) # rename ispt = f2.endswith('.pt') # is *.pt strip_optimizer(f2) if ispt else None # strip optimizer os.system('gsutil cp %s gs://%s/weights' % ( f2, opt.bucket)) if opt.bucket and ispt else None # upload # Finish if not opt.evolve: plot_results(save_dir=log_dir) # save as results.png print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group() if rank not in [-1, 0] else None torch.cuda.empty_cache() return results
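The warmup and cosine-decay logic in the batch loop above can be exercised in isolation. A minimal sketch, assuming placeholder values for the epoch count, base LR, warmup length nw, and target momentum (0.937 stands in for hyp['momentum']):

import math
import numpy as np
import torch

epochs, lr0, nw = 300, 0.01, 1000          # placeholder schedule length, base LR, warmup batches
model = torch.nn.Linear(10, 10)
optimizer = torch.optim.SGD(model.parameters(), lr=lr0, momentum=0.9)

lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * 0.8 + 0.2   # cosine from 1.0 down to 0.2
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)  # stores 'initial_lr' per group

ni, epoch = 500, 0   # integrated batch counter and current epoch (placeholders)
if ni <= nw:         # linear warmup from 0 towards the cosine-scheduled LR
    for g in optimizer.param_groups:
        g['lr'] = np.interp(ni, [0, nw], [0.0, g['initial_lr'] * lf(epoch)])
        if 'momentum' in g:
            g['momentum'] = np.interp(ni, [0, nw], [0.9, 0.937])  # 0.937 stands in for hyp['momentum']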
def train( cfg, data_cfg, img_size=416, resume=False, epochs=100, # 500200 batches at bs 64, dataset length 117263 batch_size=32, accumulate=1, multi_scale=False, freeze_backbone=False, transfer=False # Transfer learning (train only YOLO layers) ): init_seeds() weights = 'weights' + os.sep latest = weights + 'latest.pt' best = weights + 'best.pt' device = torch_utils.select_device() if multi_scale: img_size = round((img_size / 32) * 1.5) * 32 # initiate with maximum multi_scale size # opt.num_workers = 0 # bug https://github.com/ultralytics/yolov3/issues/174 else: torch.backends.cudnn.benchmark = True # unsuitable for multiscale # Configure run data_dict = parse_data_cfg(data_cfg) train_path = data_dict['train'] nc = int(data_dict['classes']) # number of classes # Initialize model model = Darknet(cfg, img_size).to(device) # Optimizer optimizer = optim.SGD(model.parameters(), lr=hyp['lr0'], momentum=hyp['momentum'], weight_decay=hyp['weight_decay']) cutoff = -1 # backbone reaches to cutoff layer start_epoch = 0 best_loss = float('inf') nf = int(model.module_defs[model.yolo_layers[0] - 1]['filters']) # yolo layer size (i.e. 255) if resume: # Load previously saved model if transfer: # Transfer learning chkpt = torch.load(weights + 'yolov3.pt', map_location=device) model.load_state_dict( { k: v for k, v in chkpt['model'].items() if v.numel() > 1 and v.shape[0] != 255 }, strict=False) for p in model.parameters(): p.requires_grad = True if p.shape[0] == nf else False else: # resume from latest.pt chkpt = torch.load(latest, map_location=device) # load checkpoint model.load_state_dict(chkpt['model']) start_epoch = chkpt['epoch'] + 1 if chkpt['optimizer'] is not None: optimizer.load_state_dict(chkpt['optimizer']) best_loss = chkpt['best_loss'] del chkpt else: # Initialize model with backbone (optional) if '-tiny.cfg' in cfg: cutoff = load_darknet_weights(model, weights + 'yolov3-tiny.conv.15') else: cutoff = load_darknet_weights(model, weights + 'darknet53.conv.74') # Scheduler https://github.com/ultralytics/yolov3/issues/238 # lf = lambda x: 1 - x / epochs # linear ramp to zero # lf = lambda x: 10 ** (hyp['lrf'] * x / epochs) # exp ramp lf = lambda x: 1 - 10**(hyp['lrf'] * (1 - x / epochs)) # inverse exp ramp scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lf, last_epoch=start_epoch - 1) # scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[218, 245], gamma=0.1, last_epoch=start_epoch-1) # # Plot lr schedule # y = [] # for _ in range(epochs): # scheduler.step() # y.append(optimizer.param_groups[0]['lr']) # plt.plot(y, label='LambdaLR') # plt.xlabel('epoch') # plt.xlabel('LR') # plt.tight_layout() # plt.savefig('LR.png', dpi=300) # Dataset dataset = LoadImagesAndLabels(train_path, img_size, batch_size, augment=True, rect=False, cache=True, multi_scale=multi_scale) # Initialize distributed training if torch.cuda.device_count() > 1: dist.init_process_group(backend=opt.backend, init_method=opt.dist_url, world_size=opt.world_size, rank=opt.rank) model = torch.nn.parallel.DistributedDataParallel(model) # sampler = torch.utils.data.distributed.DistributedSampler(dataset) # Dataloader dataloader = DataLoader( dataset, batch_size=batch_size, num_workers=opt.num_workers, shuffle=False, # disable rectangular training if True pin_memory=True, collate_fn=dataset.collate_fn) # Mixed precision training https://github.com/NVIDIA/apex # install help: https://github.com/NVIDIA/apex/issues/259 mixed_precision = False if mixed_precision: from apex import amp model, optimizer = 
amp.initialize(model, optimizer, opt_level='O1') # Start training model.hyp = hyp # attach hyperparameters to model model.class_weights = labels_to_class_weights(dataset.labels, nc).to( device) # attach class weights model_info(model, report='full') nb = len(dataloader) maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0) # P, R, mAP, F1, test_loss n_burnin = min(round(nb / 5 + 1), 1000) # burn-in batches for f in glob.glob('train_batch*.jpg') + glob.glob('test_batch*.jpg'): os.remove(f) t, t0 = time.time(), time.time() for epoch in range(start_epoch, epochs): model.train() print( ('\n%8s%12s' + '%10s' * 7) % ('Epoch', 'Batch', 'xy', 'wh', 'conf', 'cls', 'total', 'targets', 'time')) # Update scheduler scheduler.step() # Freeze backbone at epoch 0, unfreeze at epoch 1 if freeze_backbone and epoch < 2: for name, p in model.named_parameters(): if int(name.split('.')[1]) < cutoff: # if layer < 75 p.requires_grad = False if epoch == 0 else True # Update image weights (optional) w = model.class_weights.cpu().numpy() * (1 - maps) # class weights image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n) # random weighted index mloss = torch.zeros(5).to(device) # mean losses for i, (imgs, targets, _, _) in enumerate(dataloader): imgs = imgs.to(device) targets = targets.to(device) nt = len(targets) # Plot images with bounding boxes if epoch == 0 and i == 0: plot_images(imgs=imgs, targets=targets, fname='train_batch0.jpg') # SGD burn-in if epoch == 0 and i <= n_burnin: lr = hyp['lr0'] * (i / n_burnin)**4 for x in optimizer.param_groups: x['lr'] = lr # Run model pred = model(imgs) # Compute loss loss, loss_items = compute_loss(pred, targets, model) if torch.isnan(loss): print('WARNING: nan loss detected, ending training') return results # Compute gradient if mixed_precision: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # Accumulate gradient for x batches before optimizing if (i + 1) % accumulate == 0 or (i + 1) == nb: optimizer.step() optimizer.zero_grad() # Update running mean of tracked metrics mloss = (mloss * i + loss_items) / (i + 1) # Print batch results s = ('%8s%12s' + '%10.3g' * 7) % ('%g/%g' % (epoch, epochs - 1), '%g/%g' % (i, nb - 1), *mloss, nt, time.time() - t) t = time.time() print(s) # Multi-Scale training (320 - 608 pixels) every 10 batches if multi_scale and (i + 1) % 10 == 0: dataset.img_size = random.choice(range(10, 20)) * 32 print('multi_scale img_size = %g' % dataset.img_size) # Calculate mAP (always test final epoch, skip first 5 if opt.nosave) if not (opt.notest or (opt.nosave and epoch < 10)) or epoch == epochs - 1: with torch.no_grad(): results, maps = test.test(cfg, data_cfg, batch_size=batch_size, img_size=img_size, model=model, conf_thres=0.1) # Write epoch results with open('results.txt', 'a') as file: file.write(s + '%11.3g' * 5 % results + '\n') # P, R, mAP, F1, test_loss # Update best loss test_loss = results[4] if test_loss < best_loss: best_loss = test_loss # Save training results save = (not opt.nosave) or (epoch == epochs - 1) if save: # Create checkpoint chkpt = { 'epoch': epoch, 'best_loss': best_loss, 'model': model.module.state_dict() if type(model) is nn.parallel.DistributedDataParallel else model.state_dict(), 'optimizer': optimizer.state_dict() } # Save latest checkpoint torch.save(chkpt, latest) # Save best checkpoint if best_loss == test_loss: torch.save(chkpt, best) # Save backup every 
10 epochs (optional) if epoch > 0 and epoch % 10 == 0: torch.save(chkpt, weights + 'backup%g.pt' % epoch) # Delete checkpoint del chkpt dt = (time.time() - t0) / 3600 print('%g epochs completed in %.3f hours.' % (epoch - start_epoch, dt)) return results
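A minimal sketch of the SGD burn-in ramp and gradient accumulation used in the loop above, with a dummy model and random data; lr0, n_burnin and accumulate are placeholder values:

import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
lr0, n_burnin, accumulate, nb = 0.01, 100, 4, 1000   # placeholder hyper-parameters

for i in range(nb):
    x, y = torch.randn(8, 4), torch.randn(8, 1)
    if i <= n_burnin:                                  # burn-in: ramp LR from 0 up to lr0
        for g in optimizer.param_groups:
            g['lr'] = lr0 * (i / n_burnin) ** 4
    loss = torch.nn.functional.mse_loss(model(x), y)
    loss.backward()
    if (i + 1) % accumulate == 0 or (i + 1) == nb:     # step only every `accumulate` batches
        optimizer.step()
        optimizer.zero_grad()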
def train(opt): opt.data_dir = Path(opt.data_dir) opt.output_dir = Path(opt.output_dir) pregenerated_data = opt.data_dir / "corpus/train" init_logger(log_file=str(opt.output_dir / "train_albert_model.log")) assert pregenerated_data.is_dir(), \ "--pregenerated_data should point to the folder of files made by prepare_lm_data_mask.py!" samples_per_epoch = 0 for i in range(opt.file_num): data_file = pregenerated_data / f"{opt.data_name}_file_{i}.json" metrics_file = pregenerated_data / f"{opt.data_name}_file_{i}_metrics.json" if data_file.is_file() and metrics_file.is_file(): metrics = json.loads(metrics_file.read_text()) samples_per_epoch += metrics['num_training_examples'] else: if i == 0: exit("No training data was found!") print( f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({opt.epochs})." ) print( "This script will loop over the available data, but training diversity may be negatively impacted." ) break logger.info(f"samples_per_epoch: {samples_per_epoch}") if opt.local_rank == -1 or opt.no_cuda: device = torch.device(f"cuda" if torch.cuda.is_available() and not opt.no_cuda else "cpu") opt.n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(opt.local_rank) device = torch.device("cuda", opt.local_rank) opt.n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( f"device: {device} , distributed training: {bool(opt.local_rank != -1)}, 16-bits training: {opt.fp16}, " ) if opt.gradient_accumulation_steps < 1: raise ValueError( f"Invalid gradient_accumulation_steps parameter: {opt.gradient_accumulation_steps}, should be >= 1" ) opt.train_batch_size = opt.train_batch_size // opt.gradient_accumulation_steps set_seed(opt.seed) tokenizer = FullTokenizer(vocab_file=opt.vocab_path, do_lower_case=opt.do_lower_case, do_cased=opt.do_cased, spm_model_file=opt.spm_model_path) total_train_examples = samples_per_epoch * opt.epochs num_train_optimization_steps = int(total_train_examples / opt.train_batch_size / opt.gradient_accumulation_steps) if opt.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) opt.warmup_steps = int(num_train_optimization_steps * opt.warmup_proportion) model = AlbertForPreTraining(config=opt) model.to(device) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': opt.weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(params=optimizer_grouped_parameters, lr=opt.learning_rate, eps=opt.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=opt.warmup_steps, num_training_steps=num_train_optimization_steps) # optimizer = Lamb(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) if opt.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." 
) model, optimizer = amp.initialize(model, optimizer, opt_level=opt.fp16_opt_level) if opt.n_gpu > 1: model = torch.nn.DataParallel(model) if opt.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[opt.local_rank], output_device=opt.local_rank) global_step = 0 mask_metric = LMAccuracy() sop_metric = LMAccuracy() tr_mask_acc = AverageMeter() tr_sop_acc = AverageMeter() tr_loss = AverageMeter() tr_mask_loss = AverageMeter() tr_sop_loss = AverageMeter() loss_fct = nn.CrossEntropyLoss(ignore_index=-1) train_logs = {} logger.info("***** Running training *****") logger.info(f" Num examples = {total_train_examples}") logger.info(f" Batch size = {opt.train_batch_size}") logger.info(f" Num steps = {num_train_optimization_steps}") logger.info(f" warmup_steps = {opt.warmup_steps}") start_time = time.time() set_seed(opt.seed) # Added here for reproducibility for epoch in range(opt.epochs): for idx in range(opt.file_num): epoch_dataset = PregeneratedDataset( file_id=idx, training_path=pregenerated_data, tokenizer=tokenizer, reduce_memory=opt.reduce_memory, data_name=opt.data_name) if opt.local_rank == -1: train_sampler = RandomSampler(epoch_dataset) else: train_sampler = DistributedSampler(epoch_dataset) train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=opt.train_batch_size) model.train() nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch outputs = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) prediction_scores = outputs[0] seq_relationship_score = outputs[1] masked_lm_loss = loss_fct( prediction_scores.view(-1, opt.vocab_size), lm_label_ids.view(-1)) next_sentence_loss = loss_fct( seq_relationship_score.view(-1, 2), is_next.view(-1)) loss = masked_lm_loss + next_sentence_loss mask_metric(logits=prediction_scores.view(-1, opt.vocab_size), target=lm_label_ids.view(-1)) sop_metric(logits=seq_relationship_score.view(-1, 2), target=is_next.view(-1)) if opt.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if opt.gradient_accumulation_steps > 1: loss = loss / opt.gradient_accumulation_steps if opt.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() nb_tr_steps += 1 tr_mask_acc.update(mask_metric.value(), n=input_ids.size(0)) tr_sop_acc.update(sop_metric.value(), n=input_ids.size(0)) tr_loss.update(loss.item(), n=1) tr_mask_loss.update(masked_lm_loss.item(), n=1) tr_sop_loss.update(next_sentence_loss.item(), n=1) if (step + 1) % opt.gradient_accumulation_steps == 0: if opt.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), opt.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), opt.max_grad_norm) optimizer.step() scheduler.step() optimizer.zero_grad() global_step += 1 if global_step % opt.num_eval_steps == 0: now = time.time() eta = now - start_time if eta > 3600: eta_format = ('%d:%02d:%02d' % (eta // 3600, (eta % 3600) // 60, eta % 60)) elif eta > 60: eta_format = '%d:%02d' % (eta // 60, eta % 60) else: eta_format = '%ds' % eta train_logs['loss'] = tr_loss.avg train_logs['mask_acc'] = tr_mask_acc.avg train_logs['sop_acc'] = tr_sop_acc.avg train_logs['mask_loss'] = tr_mask_loss.avg train_logs['sop_loss'] = tr_sop_loss.avg show_info = f'[Training]:[{epoch}/{opt.epochs}]{global_step}/{num_train_optimization_steps} ' \ f'- ETA: {eta_format}' + "-".join( [f' {key}: {value:.4f} ' for key, value in train_logs.items()]) logger.info(show_info) tr_mask_acc.reset() tr_sop_acc.reset() tr_loss.reset() tr_mask_loss.reset() tr_sop_loss.reset() start_time = now if global_step % opt.num_save_steps == 0: if opt.local_rank in [-1, 0] and opt.num_save_steps > 0: # Save model checkpoint output_dir = opt.output_dir / f'lm-checkpoint-{global_step}' if not output_dir.exists(): output_dir.mkdir() # save model model_to_save = model.module if hasattr( model, 'module' ) else model # Take care of distributed/parallel training model_to_save.save_pretrained(str(output_dir)) torch.save(opt, str(output_dir / 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_dir) # save config output_config_file = output_dir / "config.json" with open(str(output_config_file), 'w') as f: f.write(model_to_save.config.to_json_string())
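The fp16 update above scales the loss by the accumulation steps, backpropagates through amp.scale_loss, and clips amp.master_params before stepping. A minimal standalone sketch of that pattern, assuming NVIDIA apex and a CUDA device are available; the model, loss and step counts are placeholders:

import torch
from apex import amp   # NVIDIA apex must be installed; CUDA is required

model = torch.nn.Linear(16, 2).cuda()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
accum_steps, max_grad_norm = 2, 1.0   # placeholders

for step in range(8):
    x = torch.randn(4, 16, device='cuda')
    loss = model(x).sum() / accum_steps                # scale the loss by the accumulation steps
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()                         # backward on the dynamically scaled loss
    if (step + 1) % accum_steps == 0:
        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
        optimizer.step()
        optimizer.zero_grad()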
def train(self): if torch.cuda.is_available() and self.use_apex: from apex import amp # load the dataloaders train_dataset, valid_dataset = self.create_dataloader() # compute feature dimensions feature_sizes = self.compute_feature_size() # train self.seed_everything() # load the pretrained model model = DeepFM(field_size=len(feature_sizes), feature_sizes=feature_sizes, channel_size=len(self.emb_index_dict["channel"]) + 2, embedding_size=self.embedding_size, is_shallow_dropout=self.is_shallow_dropout, dropout_shallow=self.dropout_shallow, deep_layers=self.deep_layers, is_deep_dropout=self.is_deep_dropout, dropout_deep=self.dropout_deep, deep_layers_activation=self.deep_layers_activation, is_batch_norm=self.is_batch_norm, random_seed=self.seed, use_fm=True, use_ffm=False, use_deep=True, use_cuda=True, use_plain_emb=self.use_plain_emb, use_seq=self.use_seq_emb, use_lstm=self.use_lstm, use_tcn=self.use_tcn, use_avg=self.use_avg, use_att=self.use_att, seq_emb_size=self.seq_emb_size, seq_hidden_size=self.seq_hidden_size, seq_pool=self.seq_pool, loss_func=self.loss_func) model.zero_grad() if torch.cuda.is_available(): model = model.to(self.device) epoch_steps = int(len(train_dataset) / self.batch_size) num_train_optimization_steps = int(self.epochs * epoch_steps) valid_every = math.floor(epoch_steps / 5) optimizer = torch.optim.Adam(model.parameters(), lr=self.lr, weight_decay=self.weight_decay) if self.optimizer == "sgd": # for sgd, start the lr from 1.0 or 0.1 optimizer = torch.optim.SGD(model.parameters(), lr=self.lr, weight_decay=self.weight_decay) if torch.cuda.is_available() and self.use_apex: model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0) # start training f_log = open("%s.txt" % self.log_name, "w", encoding="utf-8") best_score = -1 model.train() # TODO: LR schedule for epoch in range(self.epochs): train_start_time = time() optimizer.zero_grad() # load each batch and train for i, large_batch_data in enumerate(tqdm(train_dataset)): batch_loss = 0 for batch_data in large_batch_data: if torch.cuda.is_available(): # feat: n (dynamic) * feat_size # seq: n (dynamic) * seq_len (dynamic) # label: n (dynamic) feature_list = batch_data[0].to(self.device) value_list = batch_data[1].to(self.device) channel_list = batch_data[2].to(self.device) cate_list = batch_data[3].to(self.device) products_list = batch_data[4].to(self.device) label_list = batch_data[5].to(self.device) else: feature_list = batch_data[0] value_list = batch_data[1] channel_list = batch_data[2] cate_list = batch_data[3] products_list = batch_data[4] label_list = batch_data[5] loss = model(feature_list, products_list, value_list, channel_list, label_list) if batch_loss == 0: batch_loss = loss / self.batch_size else: batch_loss += loss / self.batch_size if batch_loss == 0: continue if torch.cuda.is_available() and self.use_apex: with amp.scale_loss(batch_loss, optimizer) as scaled_loss: scaled_loss.backward() if self.use_grad_clip: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), self.max_grad) else: batch_loss.backward() if self.use_grad_clip: torch.nn.utils.clip_grad_norm_(model.parameters(), self.max_grad) optimizer.step() optimizer.zero_grad() # start validation valid_start_time = time() total_label_list = [] prob_list = [] model.eval() for j, large_batch_data in enumerate(tqdm(valid_dataset)): for valid_batch_data in large_batch_data: if torch.cuda.is_available(): feature_list = valid_batch_data[0].to(self.device) value_list = valid_batch_data[1].to(self.device) channel_list = valid_batch_data[2].to(self.device) cate_list = valid_batch_data[3].to(self.device) products_list = 
valid_batch_data[4].to(self.device) label_list = valid_batch_data[5].to(self.device) else: feature_list = valid_batch_data[0] value_list = valid_batch_data[1] channel_list = valid_batch_data[2] cate_list = valid_batch_data[3] products_list = valid_batch_data[4] label_list = valid_batch_data[5] probs = model(feature_list, products_list, value_list, channel_list) label_list = label_list.to('cpu').detach().numpy().tolist() prob = probs.to('cpu').detach().numpy().tolist() total_label_list.append(label_list) prob_list.append(prob) macro_auc_score, micro_auc_score, acc_score = self.evaluate( prob_list, total_label_list) score = micro_auc_score print( "epoch: %d, train_duration: %d min , valid_duration: %d min " % (epoch + 1, int((valid_start_time - train_start_time) / 60), int((time() - valid_start_time) / 60))) print( "macro_auc_score: %.3f, micro_auc_score: %.3f, acc_score: %.3f " % (macro_auc_score, micro_auc_score, acc_score)) f_log.write( "epoch: %d, train_duration: %d min , valid_duration: %d min \n" % (epoch + 1, int((valid_start_time - train_start_time) / 60), int((time() - valid_start_time) / 60))) f_log.write( "macro_auc_score: %.3f, micro_auc_score: %.3f, acc_score: %.3f \n" % (macro_auc_score, micro_auc_score, acc_score)) f_log.flush() save_start_time = time() # save the model if not self.debug_mode and score > best_score and self.output_model: best_score = score state_dict = model.state_dict() model_name = os.path.join( self.model_save_dir, "model_%d_%d_%d.bin" % (macro_auc_score * 100, micro_auc_score * 100, acc_score * 100)) torch.save(state_dict, model_name) print("model save duration: %d min" % int( (time() - save_start_time) / 60)) f_log.write("model save duration: %d min\n" % int( (time() - save_start_time) / 60)) f_log.flush() model.train() f_log.close()
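The inner loop above sums per-micro-batch losses, each divided by batch_size, into one batch_loss and then does a single backward/step. A minimal sketch of that accumulation with a dummy model and arbitrary micro-batch sizes:

import torch

model = torch.nn.Linear(8, 1)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
batch_size = 32   # nominal large-batch size used to normalize the summed loss

micro_batches = [torch.randn(n, 8) for n in (5, 11, 16)]   # variable-size micro-batches
batch_loss = 0
for xb in micro_batches:
    loss = model(xb).pow(2).sum()              # stand-in for the per-micro-batch training loss
    batch_loss = batch_loss + loss / batch_size
batch_loss.backward()
optimizer.step()
optimizer.zero_grad()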
def do_train( model, data_loader, optimizer, scheduler, checkpointer, device, checkpoint_period, arguments, use_amp, cfg, dllogger, args, per_iter_end_callback_fn=None, ): dllogger.log(step="PARAMETER", data={"train_start": True}) meters = MetricLogger(delimiter=" ") max_iter = len(data_loader) start_iter = arguments["iteration"] model.train() start_training_time = time.time() end = time.time() for iteration, (images, targets, _) in enumerate(data_loader, start_iter): data_time = time.time() - end iteration = iteration + 1 arguments["iteration"] = iteration images = images.to(device) targets = [target.to(device) for target in targets] loss_dict = model(images, targets) losses = sum(loss for loss in loss_dict.values()) # reduce losses over all GPUs for logging purposes loss_dict_reduced = reduce_loss_dict(loss_dict) losses_reduced = sum(loss for loss in loss_dict_reduced.values()) meters.update(loss=losses_reduced, **loss_dict_reduced) # Note: If mixed precision is not used, this ends up doing nothing # Otherwise apply loss scaling for mixed-precision recipe if use_amp: with amp.scale_loss(losses, optimizer) as scaled_losses: scaled_losses.backward() else: losses.backward() if not cfg.SOLVER.ACCUMULATE_GRAD: optimizer.step() scheduler.step() optimizer.zero_grad() else: if (iteration + 1) % cfg.SOLVER.ACCUMULATE_STEPS == 0: for param in model.parameters(): if param.grad is not None: param.grad.data.div_(cfg.SOLVER.ACCUMULATE_STEPS) optimizer.step() scheduler.step() optimizer.zero_grad() batch_time = time.time() - end end = time.time() meters.update(time=batch_time, data=data_time) eta_seconds = meters.time.global_avg * (max_iter - iteration) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) if iteration % 20 == 0 or iteration == max_iter: log_data = {"eta":eta_string, "learning_rate":optimizer.param_groups[0]["lr"], "memory": torch.cuda.max_memory_allocated() / 1024.0 / 1024.0 } log_data.update(meters.get_dict()) dllogger.log(step=(iteration,), data=log_data) if iteration % args.print_freq == 0 or iteration == max_iter: if is_main_process(): args.writer.add_scalar('Loss/loss', losses_reduced.item(), iteration) for k,v in loss_dict_reduced.items(): args.writer.add_scalar('Loss/'+k, v.item(), iteration) if cfg.SAVE_CHECKPOINT: if iteration % checkpoint_period == 0: checkpointer.save("model_{:07d}".format(iteration), **arguments) if iteration == max_iter: checkpointer.save("model_final", **arguments) # per-epoch work (testing) if per_iter_end_callback_fn is not None: early_exit = per_iter_end_callback_fn(iteration=iteration) if early_exit: break if args.eval_loss and iteration > 0 and iteration % args.iters_per_epoch == 0: print("Warning: this is very slow and buggy.") evaluator(cfg,args,model,device,iteration) total_training_time = time.time() - start_training_time total_time_str = str(datetime.timedelta(seconds=total_training_time)) dllogger.log(step=tuple(), data={"e2e_train_time": total_training_time, "train_perf_fps": max_iter * cfg.SOLVER.IMS_PER_BATCH / total_training_time}) logger = logging.getLogger("maskrcnn_benchmark.trainer") logger.info( "Total training time: {} ({:.4f} s / it)".format( total_time_str, total_training_time / (max_iter) ) )
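When cfg.SOLVER.ACCUMULATE_GRAD is set, the loop above divides the accumulated gradients by ACCUMULATE_STEPS before stepping. A minimal sketch of that explicit gradient averaging with a toy model; the step count and ACCUMULATE_STEPS value are placeholders:

import torch

model = torch.nn.Linear(8, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
ACCUMULATE_STEPS = 4   # placeholder for cfg.SOLVER.ACCUMULATE_STEPS

for iteration in range(1, 17):
    x, y = torch.randn(2, 8), torch.randn(2, 1)
    torch.nn.functional.mse_loss(model(x), y).backward()   # gradients accumulate across batches
    if iteration % ACCUMULATE_STEPS == 0:
        for p in model.parameters():                       # average the accumulated gradients
            if p.grad is not None:
                p.grad.data.div_(ACCUMULATE_STEPS)
        optimizer.step()
        optimizer.zero_grad()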
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank, group_name, hparams): """Training and validation logging results to tensorboard and stdout Params ------ output_directory (string): directory to save checkpoints log_directory (string) directory to save tensorboard logs checkpoint_path(string): checkpoint path n_gpus (int): number of gpus rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. """ if hparams.distributed_run: init_distributed(hparams, n_gpus, rank, group_name) torch.manual_seed(hparams.seed) if device == 'cuda': torch.cuda.manual_seed(hparams.seed) model = load_model(hparams) learning_rate = hparams.learning_rate optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay) if hparams.fp16_run: from apex import amp model, optimizer = amp.initialize( model, optimizer, opt_level='O2') if hparams.distributed_run: model = apply_gradient_allreduce(model) criterion = Tacotron2Loss() logger = prepare_directories_and_logger(output_directory, log_directory, rank) train_loader, valset, collate_fn, train_sampler = prepare_dataloaders(hparams, output_directory) # Load checkpoint if one exists iteration = 0 epoch_offset = 0 if checkpoint_path is not None: if warm_start: model = warm_start_model( checkpoint_path, model, hparams.ignore_layers) else: model, optimizer, _learning_rate, iteration = load_checkpoint( checkpoint_path, model, optimizer) if hparams.use_saved_learning_rate: learning_rate = _learning_rate iteration += 1 # next iteration is iteration + 1 epoch_offset = max(0, int(iteration / len(train_loader))) model.train() is_overflow = False # ================ MAIN TRAINNIG LOOP! =================== for epoch in range(epoch_offset, hparams.epochs): print("Epoch: {}".format(epoch)) #if train_sampler is not None: # train_sampler.set_epoch(epoch) for i, batch in enumerate(train_loader): start = time.perf_counter() for param_group in optimizer.param_groups: param_group['lr'] = learning_rate model.zero_grad() x, y = model.parse_batch(batch) y_pred = model(x) loss = criterion(y_pred, y) if hparams.distributed_run: reduced_loss = reduce_tensor(loss.data, n_gpus).item() else: reduced_loss = loss.item() if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if hparams.fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), hparams.grad_clip_thresh) is_overflow = math.isnan(grad_norm) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) optimizer.step() if not is_overflow and rank == 0: duration = time.perf_counter() - start print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format( iteration, reduced_loss, grad_norm, duration)) logger.log_training( reduced_loss, grad_norm, learning_rate, duration, iteration) if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0): validate(model, criterion, valset, iteration, hparams.batch_size, n_gpus, collate_fn, logger, hparams.distributed_run, rank) if rank == 0: checkpoint_path = os.path.join( output_directory, "checkpoint_{}".format(iteration)) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
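The fp16 branch above uses the NaN check on the clipped gradient norm only to gate logging and checkpointing; a common variant also skips the optimizer step when the loss scaler overflowed. A minimal sketch of that variant, assuming apex and a GPU are available; the clip threshold and loss are placeholders:

import math
import torch
from apex import amp   # NVIDIA apex must be installed; CUDA is required

model = torch.nn.Linear(8, 1).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
model, optimizer = amp.initialize(model, optimizer, opt_level='O2')
grad_clip_thresh = 1.0   # placeholder

x = torch.randn(4, 8, device='cuda')
loss = model(x).sum()                      # stand-in for the training loss
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
grad_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), grad_clip_thresh)
if not math.isnan(grad_norm):              # skip the update if the loss scaler overflowed
    optimizer.step()
optimizer.zero_grad()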
def train(args, train_dataset, model): """ Train the model """ args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) t_total = len(train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=t_total * 0.1, t_total=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) if args.fp16_opt_level == "O2": keep_batchnorm_fp32 = False else: keep_batchnorm_fp32 = True model, optimizer = amp.initialize( model, optimizer, opt_level=args.fp16_opt_level, keep_batchnorm_fp32=keep_batchnorm_fp32) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = DDP( model, message_size=250000000, gradient_predivide_factor=torch.distributed.get_world_size()) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs = 0 model.zero_grad() model.train() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed( args) # Added here for reproductibility (even between python 2 and 3) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Train(XX Epoch) Step(X/X) (loss=X.X)", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): batch = tuple(t.to(args.device) for t in batch) # multi-gpu does scattering it-self input_ids, input_mask, segment_ids, start_positions, end_positions = batch outputs = model(input_ids, segment_ids, input_mask, start_positions, end_positions) loss = outputs # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel (not distributed) training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) scheduler.step() # Update learning rate schedule\ optimizer.step() optimizer.zero_grad() global_step += 1 epoch_iterator.set_description( "Train(%d Epoch) Step(%d / %d) (loss=%5.5f)" % (_, global_step, t_total, loss.item())) if args.local_rank in [-1, 0]: model_checkpoint = 'korquad_{0}_{1}_{2}_{3}.bin'.format( args.learning_rate, args.train_batch_size, epochs, int(args.num_train_epochs)) logger.info(model_checkpoint) output_model_file = os.path.join(args.output_dir, model_checkpoint) if args.n_gpu > 1 or args.local_rank != -1: logger.info("** ** * Saving file * ** **(module)") torch.save(model.module.state_dict(), output_model_file) else: logger.info("** ** * Saving file * ** **") torch.save(model.state_dict(), output_model_file) epochs += 1 logger.info("Training End!!!")
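Note that the update above calls scheduler.step() before optimizer.step(); since PyTorch 1.1 the documented order is the reverse, otherwise the first value of the schedule is effectively skipped. A minimal sketch of the usual ordering with a dummy model and a placeholder linear-decay schedule:

import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda s: max(0.0, 1.0 - s / 1000))

for step in range(10):
    loss = model(torch.randn(2, 4)).sum()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()        # update parameters first ...
    scheduler.step()        # ... then advance the LR schedule
    optimizer.zero_grad()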
def fit(self, dl: Tuple[DataLoader, DataLoader], n_epochs, device, accum_steps, eval_steps, use_all_gpu, fp16_opt_level, max_grad_norm): self.train_loss = 0 self.device = device self.train_dl, self.valid_dl = dl n_gpu = torch.cuda.device_count() if use_all_gpu else 1 if self.train_dl.batch_size / n_gpu != int(self.train_dl.batch_size / n_gpu): raise ValueError(f"You have {n_gpu} GPUs, batch size must be divisible by {n_gpu}") if n_gpu > 1: self.model = torch.nn.DataParallel(self.model) self.model = self.model.to(self.device) if fp16_opt_level is not None: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") self.model, self.optimizer = amp.initialize(self.model, self.optimizer, opt_level=fp16_opt_level) self.n_epochs = n_epochs self.n_epoch_steps = len(self.train_dl) pb_epochs = tqdm(range(n_epochs), total=n_epochs, desc='Training') [c.on_train_start(learner=self) for c in self.callbacks] for cur_epoch in pb_epochs: self.cur_epoch = cur_epoch self.cur_epoch_step = 0 pb_epochs.set_postfix({'Epoch': f'{cur_epoch + 1}/{n_epochs}'}) pb_batches = tqdm(enumerate(self.train_dl), total=len(self.train_dl), desc='Epoch') for cur_batch, batch in pb_batches: self.cur_epoch_step += 1 inputs = utils.to_device(batch, device=device) self.model.train() _, loss = self.model(inputs) if n_gpu > 1: loss = loss.mean() loss /= accum_steps if fp16_opt_level is not None: with amp.scale_loss(loss, self.optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() self.overall_step += 1 self.train_loss += loss.item() if self.overall_step % accum_steps == 0: if fp16_opt_level is not None: self.grad_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), max_grad_norm) else: self.grad_norm = torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm) self.optimizer.step() [c.on_opt_step(learner=self) for c in self.callbacks] self.optimizer.zero_grad() pb_batches.set_postfix_str(self.get_log_str()) self.train_loss = 0 if (self.overall_step % eval_steps == 0) and self.valid_dl is not None: self.valid_loss = self.eval(dl=self.valid_dl) [c.on_eval_end(learner=self) for c in self.callbacks] [c.on_epoch_end(learner=self) for c in self.callbacks] [c.on_train_end(learner=self) for c in self.callbacks]
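The fit() above drives training through a small callback protocol (on_train_start, on_opt_step, on_eval_end, on_epoch_end, on_train_end). A minimal sketch of that hook pattern; ToyLearner and PrintLossCallback are illustrative stand-ins, not classes from this codebase:

class PrintLossCallback:
    def on_train_start(self, learner):
        print('training started')
    def on_opt_step(self, learner):
        print(f'step {learner.overall_step}: running loss {learner.train_loss:.4f}')
    def on_epoch_end(self, learner):
        print(f'epoch {learner.cur_epoch} finished')
    def on_train_end(self, learner):
        print('training finished')

class ToyLearner:
    def __init__(self, callbacks):
        self.callbacks = callbacks
        self.overall_step, self.cur_epoch, self.train_loss = 0, 0, 0.0

    def fit(self, n_steps=3):
        [c.on_train_start(learner=self) for c in self.callbacks]
        for step in range(1, n_steps + 1):
            self.overall_step = step
            self.train_loss = 1.0 / step          # stand-in for the accumulated loss
            [c.on_opt_step(learner=self) for c in self.callbacks]
        [c.on_epoch_end(learner=self) for c in self.callbacks]
        [c.on_train_end(learner=self) for c in self.callbacks]

ToyLearner([PrintLossCallback()]).fit()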
def train( self, train_dataset, output_dir, show_running_loss=True, eval_data=None, verbose=True, **kwargs, ): """ Trains the model on train_dataset. Utility function to be used by the train_model() method. Not intended to be used directly. """ model = self.model args = self.args device = self.device tb_writer = SummaryWriter(logdir=args["tensorboard_dir"]) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"]) if args["max_steps"] > 0: t_total = args["max_steps"] args["num_train_epochs"] = ( args["max_steps"] // (len(train_dataloader) // args["gradient_accumulation_steps"]) + 1) else: t_total = len(train_dataloader) // args[ "gradient_accumulation_steps"] * args["num_train_epochs"] no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args["weight_decay"], }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ] }, ] warmup_steps = math.ceil(t_total * args["warmup_ratio"]) args["warmup_steps"] = warmup_steps if args[ "warmup_steps"] == 0 else args["warmup_steps"] optimizer = AdamW(optimizer_grouped_parameters, lr=args["learning_rate"], eps=args["adam_epsilon"]) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args["warmup_steps"], num_training_steps=t_total) if (args["model_name"] and os.path.isfile( os.path.join(args["model_name"], "optimizer.pt")) and os.path.isfile( os.path.join(args["model_name"], "scheduler.pt"))): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args["model_name"], "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args["model_name"], "scheduler.pt"))) if args["fp16"]: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." 
) model, optimizer = amp.initialize(model, optimizer, opt_level=args["fp16_opt_level"]) if args["n_gpu"] > 1: model = torch.nn.DataParallel(model) logger.info(" Training started") global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args["num_train_epochs"]), desc="Epoch", disable=args["silent"], mininterval=0) epoch_number = 0 best_eval_metric = None early_stopping_counter = 0 steps_trained_in_current_epoch = 0 epochs_trained = 0 if args["model_name"] and os.path.exists(args["model_name"]): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args["model_name"].split("/")[-1].split( "-") if len(checkpoint_suffix) > 2: checkpoint_suffix = checkpoint_suffix[1] else: checkpoint_suffix = checkpoint_suffix[-1] global_step = int(checkpoint_suffix) epochs_trained = global_step // ( len(train_dataloader) // args["gradient_accumulation_steps"]) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args["gradient_accumulation_steps"]) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info( " Will skip the first %d steps in the current epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") if args["evaluate_during_training"]: training_progress_scores = self._create_training_progress_scores( **kwargs) if args["wandb_project"]: wandb.init(project=args["wandb_project"], config={**args}, **args["wandb_kwargs"]) wandb.watch(self.model) model.train() for current_epoch in train_iterator: if epochs_trained > 0: epochs_trained -= 1 continue # epoch_iterator = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate( tqdm(train_dataloader, desc="Current iteration", disable=args["silent"])): if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue batch = tuple(t.to(device) for t in batch) inputs = self._get_inputs_dict(batch) outputs = model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] if args["n_gpu"] > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training current_loss = loss.item() if show_running_loss: print("\rRunning loss: %f" % loss, end="") if args["gradient_accumulation_steps"] > 1: loss = loss / args["gradient_accumulation_steps"] if args["fp16"]: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() # torch.nn.utils.clip_grad_norm_( # amp.master_params(optimizer), args["max_grad_norm"] # ) else: loss.backward() # torch.nn.utils.clip_grad_norm_( # model.parameters(), args["max_grad_norm"] # ) tr_loss += loss.item() if (step + 1) % args["gradient_accumulation_steps"] == 0: if args["fp16"]: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args["max_grad_norm"]) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args["max_grad_norm"]) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args["logging_steps"] > 0 and global_step % args[ "logging_steps"] == 0: # Log metrics tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args["logging_steps"], global_step) logging_loss = tr_loss if args["wandb_project"]: wandb.log({ "Training loss": current_loss, "lr": scheduler.get_lr()[0], "global_step": global_step, }) if 
args["save_steps"] > 0 and global_step % args[ "save_steps"] == 0: # Save model checkpoint output_dir_current = os.path.join( output_dir, "checkpoint-{}".format(global_step)) self._save_model(output_dir_current, optimizer, scheduler, model=model) if args["evaluate_during_training"] and ( args["evaluate_during_training_steps"] > 0 and global_step % args["evaluate_during_training_steps"] == 0): # Only evaluate when single GPU otherwise metrics may not average well results = self.eval_model( eval_data, verbose=verbose and args["evaluate_during_training_verbose"], silent=True, **kwargs, ) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) output_dir_current = os.path.join( output_dir, "checkpoint-{}".format(global_step)) if args["save_eval_checkpoints"]: self._save_model(output_dir_current, optimizer, scheduler, model=model, results=results) training_progress_scores["global_step"].append( global_step) training_progress_scores["train_loss"].append( current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv( os.path.join(args["output_dir"], "training_progress_scores.csv"), index=False, ) if args["wandb_project"]: wandb.log( self._get_last_metrics( training_progress_scores)) if not best_eval_metric: best_eval_metric = results[ args["early_stopping_metric"]] self._save_model(args["best_model_dir"], optimizer, scheduler, model=model, results=results) if best_eval_metric and args[ "early_stopping_metric_minimize"]: if (results[args["early_stopping_metric"]] - best_eval_metric < args["early_stopping_delta"]): best_eval_metric = results[ args["early_stopping_metric"]] self._save_model(args["best_model_dir"], optimizer, scheduler, model=model, results=results) early_stopping_counter = 0 else: if args["use_early_stopping"]: if early_stopping_counter < args[ "early_stopping_patience"]: early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args['early_stopping_metric']}" ) logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args['early_stopping_patience']}" ) else: if verbose: logger.info( f" Patience of {args['early_stopping_patience']} steps reached" ) logger.info( " Training terminated.") train_iterator.close() return global_step, tr_loss / global_step else: if (results[args["early_stopping_metric"]] - best_eval_metric > args["early_stopping_delta"]): best_eval_metric = results[ args["early_stopping_metric"]] self._save_model(args["best_model_dir"], optimizer, scheduler, model=model, results=results) early_stopping_counter = 0 else: if args["use_early_stopping"]: if early_stopping_counter < args[ "early_stopping_patience"]: early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args['early_stopping_metric']}" ) logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args['early_stopping_patience']}" ) else: if verbose: logger.info( f" Patience of {args['early_stopping_patience']} steps reached" ) logger.info( " Training terminated.") train_iterator.close() return global_step, tr_loss / global_step epoch_number += 1 output_dir_current = os.path.join( output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number)) if args["save_model_every_epoch"] or args[ "evaluate_during_training"]: os.makedirs(output_dir_current, exist_ok=True) if args["save_model_every_epoch"]: self._save_model(output_dir_current, optimizer, scheduler, 
model=model) if args["evaluate_during_training"]: results = self.eval_model( eval_data, verbose=verbose and args["evaluate_during_training_verbose"], silent=True, **kwargs) self._save_model(output_dir_current, optimizer, scheduler, results=results) training_progress_scores["global_step"].append(global_step) training_progress_scores["train_loss"].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv(os.path.join(args["output_dir"], "training_progress_scores.csv"), index=False) if args["wandb_project"]: wandb.log(self._get_last_metrics(training_progress_scores)) if not best_eval_metric: best_eval_metric = results[args["early_stopping_metric"]] self._save_model(args["best_model_dir"], optimizer, scheduler, model=model, results=results) if best_eval_metric and args["early_stopping_metric_minimize"]: if results[args[ "early_stopping_metric"]] - best_eval_metric < args[ "early_stopping_delta"]: best_eval_metric = results[ args["early_stopping_metric"]] self._save_model(args["best_model_dir"], optimizer, scheduler, model=model, results=results) early_stopping_counter = 0 else: if args["use_early_stopping"] and args[ "early_stopping_consider_epochs"]: if early_stopping_counter < args[ "early_stopping_patience"]: early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args['early_stopping_metric']}" ) logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args['early_stopping_patience']}" ) else: if verbose: logger.info( f" Patience of {args['early_stopping_patience']} steps reached" ) logger.info(" Training terminated.") train_iterator.close() return global_step, tr_loss / global_step else: if results[args[ "early_stopping_metric"]] - best_eval_metric > args[ "early_stopping_delta"]: best_eval_metric = results[ args["early_stopping_metric"]] self._save_model(args["best_model_dir"], optimizer, scheduler, model=model, results=results) early_stopping_counter = 0 else: if args["use_early_stopping"] and args[ "early_stopping_consider_epochs"]: if early_stopping_counter < args[ "early_stopping_patience"]: early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args['early_stopping_metric']}" ) logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args['early_stopping_patience']}" ) else: if verbose: logger.info( f" Patience of {args['early_stopping_patience']} steps reached" ) logger.info(" Training terminated.") train_iterator.close() return global_step, tr_loss / global_step return global_step, tr_loss / global_step
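The early-stopping branches above repeat the same counter logic for the minimize and maximize cases; the core check can be factored into one helper. A minimal sketch (the metric values, delta and patience are placeholders):

def check_early_stopping(result, best, counter, minimize=True, delta=0.0, patience=3):
    """Return (new_best, new_counter, should_stop)."""
    improved = best is None or (result < best - delta if minimize else result > best + delta)
    if improved:
        return result, 0, False
    counter += 1
    return best, counter, counter >= patience

best, counter = None, 0
for eval_loss in [0.9, 0.8, 0.81, 0.82, 0.83]:        # dummy eval metric per evaluation step
    best, counter, stop = check_early_stopping(eval_loss, best, counter, minimize=True)
    if stop:
        print('early stopping triggered')
        break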
# sum per device losses losses = [ torch.mean(x) if not isinstance(x, int) else x for x in losses ] loss_dict = dict(zip(model.module.loss_names, losses)) # calculate final loss scalar loss_D = (loss_dict['D_fake'] + loss_dict['D_real']) * 0.5 loss_G = loss_dict['G_GAN'] + loss_dict.get( 'G_GAN_Feat', 0) + loss_dict.get('G_VGG', 0) ############### Backward Pass #################### # update generator weights optimizer_G.zero_grad() if opt.fp16: with amp.scale_loss(loss_G, optimizer_G) as scaled_loss: scaled_loss.backward() else: loss_G.backward() optimizer_G.step() # update discriminator weights optimizer_D.zero_grad() if opt.fp16: with amp.scale_loss(loss_D, optimizer_D) as scaled_loss: scaled_loss.backward() else: loss_D.backward() optimizer_D.step() ############## Display results and errors ##########
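The fragment above alternates a generator update and a discriminator update, each with its own optimizer (and, under opt.fp16, its own amp.scale_loss context). A minimal sketch of the alternating update without the fp16 branch, using dummy networks and a stand-in GAN loss:

import torch

netG = torch.nn.Linear(8, 8)
netD = torch.nn.Linear(8, 1)
optimizer_G = torch.optim.Adam(netG.parameters(), lr=2e-4, betas=(0.5, 0.999))
optimizer_D = torch.optim.Adam(netD.parameters(), lr=2e-4, betas=(0.5, 0.999))
criterion = torch.nn.BCEWithLogitsLoss()

real = torch.randn(4, 8)
fake = netG(torch.randn(4, 8))

# generator update: push the discriminator towards labelling fakes as real
optimizer_G.zero_grad()
loss_G = criterion(netD(fake), torch.ones(4, 1))
loss_G.backward()
optimizer_G.step()

# discriminator update: average of real and fake losses; detach fake so G is untouched
optimizer_D.zero_grad()
loss_D = 0.5 * (criterion(netD(real), torch.ones(4, 1)) +
                criterion(netD(fake.detach()), torch.zeros(4, 1)))
loss_D.backward()
optimizer_D.step()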
def train_one_epoch(epoch, model, loader, criterion, optimizer, db): loss_meter = AverageMeter() lr = optimizer.state_dict()['param_groups'][0]['lr'] get_logger().info('[Start] epoch: %d' % epoch) get_logger().info('lr: %f' % lr) # update dataset loader.dataset.update() if epoch < config.FREEZE_EPOCH: get_logger().info('freeze model parameter') # freeze pretrained layers for name, child in model.named_children(): if name in ['feature']: for param in child.parameters(): param.requires_grad = False elif epoch == config.FREEZE_EPOCH: get_logger().info('unfreeze model parameter') for name, child in model.named_children(): for param in child.parameters(): param.requires_grad = True # free last Linear layer # for param in model.last_layer[-1].parameters(): # print('freeze last layer') # param.requires_grad = False # train phase model.train() for i, data in enumerate(tqdm(loader)): img, label = data img = img.to(config.DEVICE, dtype=torch.float) label = label.to(config.DEVICE, dtype=torch.float) with torch.set_grad_enabled(True): # mixed_img, label_a, label_b, lam = mixup_data(img, label, 1.0) logit = model(img) # print(logit.size()) loss = criterion(logit, label) # loss = mixup_criterion(criterion, logit, label_a, label_b, lam) # backward with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() # loss.backward() if config.VIZ_GRAD and i % config.PRINT_FREQ == 0: # visualize grad # plot_grad_flow(model.named_parameters()) db.rec_grad(model, epoch, i) optimizer.step() optimizer.zero_grad() loss_meter.update(loss.item(), img.size(0)) # print if i % config.PRINT_FREQ == 0: logit_cpu = logit.detach().cpu() get_logger().info('\n' + str_stats(logit_cpu[0].numpy())) prob = torch.sigmoid(logit_cpu) get_logger().info('\n' + str_stats(prob[0].numpy())) get_logger().info('train: %d loss: %f (just now)' % (i, loss_meter.val)) get_logger().info('train: %d loss: %f' % (i, loss_meter.avg)) db.rec_history(epoch, i, 'train', img.size( 0), lr, loss_meter.avg, loss_meter.avg) get_logger().info("Epoch %d/%d train loss %f" % (epoch, config.EPOCHS, loss_meter.avg)) return loss_meter.avg
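A minimal sketch of the freeze-then-unfreeze schedule applied above to the pretrained 'feature' submodule; the module layout and FREEZE_EPOCH value are placeholders:

import torch

FREEZE_EPOCH = 2   # placeholder for config.FREEZE_EPOCH

class Net(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.feature = torch.nn.Linear(16, 16)   # pretrained backbone (placeholder)
        self.head = torch.nn.Linear(16, 4)       # task-specific head

    def forward(self, x):
        return self.head(self.feature(x))

model = Net()
for epoch in range(4):
    if epoch < FREEZE_EPOCH:
        # freeze the pretrained backbone for the first FREEZE_EPOCH epochs
        for name, child in model.named_children():
            if name == 'feature':
                for p in child.parameters():
                    p.requires_grad = False
    elif epoch == FREEZE_EPOCH:
        # unfreeze all parameters afterwards
        for p in model.parameters():
            p.requires_grad = True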
def deeplab_training(model_name, model_type, optimizer_name, lr_scheduler_name,
                     lr, batch_size, valid_batch_size, num_epoch, start_epoch,
                     accumulation_steps, train_data_folder, checkpoint_folder,
                     load_pretrain):

    COMMON_STRING = '@%s: \n' % os.path.basename(__file__)
    COMMON_STRING += '\tset random seed\n'
    COMMON_STRING += '\t\tSEED = %d\n' % SEED

    # disable the cudnn auto-tuner and force deterministic kernels for reproducibility
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.deterministic = True

    COMMON_STRING += '\tset cuda environment\n'
    COMMON_STRING += '\t\ttorch.__version__              = %s\n' % torch.__version__
    COMMON_STRING += '\t\ttorch.version.cuda             = %s\n' % torch.version.cuda
    COMMON_STRING += '\t\ttorch.backends.cudnn.version() = %s\n' % torch.backends.cudnn.version()
    try:
        COMMON_STRING += '\t\tos[\'CUDA_VISIBLE_DEVICES\'] = %s\n' % os.environ['CUDA_VISIBLE_DEVICES']
        NUM_CUDA_DEVICES = len(os.environ['CUDA_VISIBLE_DEVICES'].split(','))
    except Exception:
        COMMON_STRING += '\t\tos[\'CUDA_VISIBLE_DEVICES\'] = None\n'
        NUM_CUDA_DEVICES = 1

    COMMON_STRING += '\t\ttorch.cuda.device_count()      = %d\n' % torch.cuda.device_count()
    COMMON_STRING += '\n'

    if not os.path.exists(checkpoint_folder + '/' + model_type + '/' + model_name):
        os.mkdir(checkpoint_folder + '/' + model_type + '/' + model_name)

    log = Logger()
    log.open(checkpoint_folder + '/' + model_type + '/' + model_name + '/' +
             model_name + '_log_train.txt', mode='a+')
    log.write('\t%s\n' % COMMON_STRING)
    log.write('\n')
    log.write('\tSEED         = %u\n' % SEED)
    log.write('\tPROJECT_PATH = %s\n' % train_data_folder)
    log.write('\t__file__     = %s\n' % __file__)
    log.write('\tout_dir      = %s\n' % checkpoint_folder)
    log.write('\n')

    ## dataset ----------------------------------------
    log.write('** dataset setting **\n')

    train_dataset = CloudDataset(
        data_dir=train_data_folder,
        mode='train',
        csv=['train.csv', ],
        split=['by_random1/train_fold_a0_5246.npy', ],
        augment=transform_train,
    )
    train_dataloader = DataLoader(train_dataset,
                                  sampler=RandomSampler(train_dataset),
                                  batch_size=batch_size,
                                  drop_last=True,
                                  num_workers=4,
                                  pin_memory=True,
                                  collate_fn=null_collate)

    valid_dataset = CloudDataset(
        data_dir=train_data_folder,
        mode='train',
        csv=['train.csv', ],
        split=['by_random1/valid_fold_a0_300.npy', ],
        # split=['by_random1/valid_small_fold_a0_120.npy', ],
        augment=transform_valid,
    )
    valid_dataloader = DataLoader(valid_dataset,
                                  sampler=SequentialSampler(valid_dataset),
                                  batch_size=valid_batch_size,
                                  drop_last=False,
                                  num_workers=4,
                                  pin_memory=True,
                                  collate_fn=null_collate)

    log.write('train_dataset : \n%s\n' % (train_dataset))
    log.write('valid_dataset : \n%s\n' % (valid_dataset))
    log.write('\n')

    ############################################################################### define unet model with backbone
    MASK_WIDTH = 525
    MASK_HEIGHT = 350

    def get_model(model_name="deep_se101", in_channel=6, num_classes=1,
                  criterion=SoftDiceLoss_binary()):
        # in_channel and criterion are passed through to the selected backbone
        if model_name == 'deep_se50':
            from semantic_segmentation.network.deepv3 import DeepSRNX50V3PlusD_m1  # r
            model = DeepSRNX50V3PlusD_m1(in_channel=in_channel,
                                         num_classes=num_classes,
                                         criterion=criterion)
        elif model_name == 'deep_se101':
            from semantic_segmentation.network.deepv3 import DeepSRNX101V3PlusD_m1  # r
            model = DeepSRNX101V3PlusD_m1(in_channel=in_channel,
                                          num_classes=num_classes,
                                          criterion=criterion)
        elif model_name == 'WideResnet38':
            from semantic_segmentation.network.deepv3 import DeepWR38V3PlusD_m1  # r
            model = DeepWR38V3PlusD_m1(in_channel=in_channel,
                                       num_classes=num_classes,
                                       criterion=criterion)
        elif model_name == 'unet_ef3':
            from ef_unet import EfficientNet_3_unet
            model = EfficientNet_3_unet()
        elif model_name == 'unet_ef5':
            from ef_unet import EfficientNet_5_unet
            model = EfficientNet_5_unet()
        else:
            print('Unknown model name: %s' % model_name)
            model = None
        return model

    ############################################################################### training parameters
    checkpoint_filename = model_type + '/' + model_name + '/' + model_name + "_" + model_type + "_deeplab_checkpoint.pth"
    checkpoint_filepath = os.path.join(checkpoint_folder, checkpoint_filename)

    ############################################################################### model and optimizer
    model = get_model(model_name=model_name,
                      in_channel=3,
                      num_classes=len(CLASSNAME_TO_CLASSNO),
                      criterion=SoftDiceLoss_binary())

    if load_pretrain:
        model = load(model, checkpoint_filepath)

    model = model.cuda()

    if optimizer_name == "Adam":
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    elif optimizer_name == "adamonecycle":
        flatten_model = lambda m: sum(map(flatten_model, m.children()), []) if num_children(m) else [m]
        get_layer_groups = lambda m: [nn.Sequential(*flatten_model(m))]
        optimizer_func = partial(optim.Adam, betas=(0.9, 0.99))
        optimizer = OptimWrapper.create(optimizer_func,
                                        3e-3,
                                        get_layer_groups(model),
                                        wd=1e-4,
                                        true_wd=True,
                                        bn_wd=True)
    elif optimizer_name == "Ranger":
        optimizer = Ranger(filter(lambda p: p.requires_grad, model.parameters()),
                           lr,
                           weight_decay=1e-5)
    else:
        raise NotImplementedError

    if lr_scheduler_name == "adamonecycle":
        scheduler = lsf.OneCycle(optimizer, len(train_dataset) * num_epoch, lr,
                                 [0.95, 0.85], 10.0, 0.4)
        lr_scheduler_each_iter = True
    elif lr_scheduler_name == "CosineAnealing":
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                               num_epoch,
                                                               eta_min=0,
                                                               last_epoch=-1)
        lr_scheduler_each_iter = False
    elif lr_scheduler_name == "WarmRestart":
        scheduler = WarmRestart(optimizer, T_max=5, T_mult=1, eta_min=1e-6)
        lr_scheduler_each_iter = False
    else:
        raise NotImplementedError

    log.write('net\n  %s\n' % (model_name))
    log.write('optimizer\n  %s\n' % (optimizer_name))
    log.write('scheduler\n  %s\n' % (lr_scheduler_name))
    log.write('\n')

    # mixed precision (apex amp)
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    ############################################################################### training
    log.write('** start training here! **\n')
    log.write('   batch_size=%d, accumulation_steps=%d\n' % (batch_size, accumulation_steps))
    log.write('   experiment = %s\n' % str(__file__.split('/')[-2:]))

    valid_loss = np.zeros(17, np.float32)
    train_loss = np.zeros(6, np.float32)
    valid_metric_optimal = np.inf
    eval_step = len(train_dataloader)  # evaluate once per epoch
    log_step = 100
    eval_count = 0

    # define tensorboard writer and timer
    writer = SummaryWriter()
    start_timer = timer()

    for epoch in range(1, num_epoch + 1):

        # update lr and start from start_epoch
        if not lr_scheduler_each_iter:
            if epoch < 6:
                if epoch != 0:
                    scheduler.step()
                    scheduler = warm_restart(scheduler, T_mult=2)
            elif epoch > 5 and epoch < 7:
                optimizer.param_groups[0]['lr'] = 1e-5
            else:
                optimizer.param_groups[0]['lr'] = 5e-6

        if epoch < start_epoch:
            continue

        log.write("Epoch%s\n" % epoch)
        log.write('\n')

        for param_group in optimizer.param_groups:
            rate = param_group['lr']

        sum_train_loss = np.zeros_like(train_loss)
        sum_train = np.zeros_like(train_loss)

        seed_everything(SEED + epoch)
        torch.cuda.empty_cache()
        optimizer.zero_grad()

        for tr_batch_i, (X, truth_label, truth_mask, infor) in enumerate(train_dataloader):

            if lr_scheduler_each_iter:
                scheduler.step(tr_batch_i)

            model.train()

            X = X.cuda().float()
            truth_label = truth_label.cuda()
            truth_mask = truth_mask.cuda()

            prediction = model(X)  # [N, C, H, W]
            loss = SoftDiceLoss_binary()(prediction, truth_mask) + \
                   criterion_mask(prediction, truth_mask, weight=None)

            with amp.scale_loss(loss / accumulation_steps, optimizer) as scaled_loss:
                scaled_loss.backward()
            # loss.backward()

            if (tr_batch_i + 1) % accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0, norm_type=2)
                optimizer.step()
                optimizer.zero_grad()

                writer.add_scalar(
                    'train_loss', loss.item(),
                    (epoch - 1) * len(train_dataloader) * batch_size + tr_batch_i * batch_size)

            # print statistics --------
            probability_mask = torch.sigmoid(prediction)
            probability_label = probability_mask_to_label(probability_mask)
            # tn, tp, num_neg, num_pos = metric_label(probability_label, truth_label)
            dn, dp, num_neg, num_pos = metric_mask(probability_mask, truth_mask)

            l = np.array([loss.item() * batch_size, dn.sum(), *dp])
            n = np.array([batch_size, num_neg.sum(), *num_pos])
            sum_train_loss += l
            sum_train += n

            # log for training
            if (tr_batch_i + 1) % log_step == 0:
                train_loss = sum_train_loss / (sum_train + 1e-12)
                sum_train_loss[...] = 0
                sum_train[...] = 0
                log.write('lr: %f train loss: %f dn: %f dp1: %f dp2: %f dp3: %f dp4: %f\n' %
                          (rate, train_loss[0], train_loss[1], train_loss[2],
                           train_loss[3], train_loss[4], train_loss[5]))

            if (tr_batch_i + 1) % eval_step == 0:

                eval_count += 1
                valid_loss = np.zeros(17, np.float32)
                valid_num = np.zeros_like(valid_loss)
                valid_metric = []

                with torch.no_grad():
                    torch.cuda.empty_cache()

                    for val_batch_i, (X, truth_label, truth_mask, infor) in enumerate(valid_dataloader):
                        model.eval()
                        X = X.cuda().float()
                        truth_label = truth_label.cuda()
                        truth_mask = truth_mask.cuda()

                        prediction = model(X)  # [N, C, H, W]
                        loss = SoftDiceLoss_binary()(prediction, truth_mask) + \
                               criterion_mask(prediction, truth_mask, weight=None)

                        writer.add_scalar(
                            'val_loss', loss.item(),
                            (eval_count - 1) * len(valid_dataloader) * valid_batch_size + val_batch_i * valid_batch_size)

                        # print statistics --------
                        probability_mask = torch.sigmoid(prediction)
                        probability_label = probability_mask_to_label(probability_mask)
                        tn, tp, _, _ = metric_label(probability_label, truth_label)
                        dn, dp, num_neg, num_pos = metric_mask(probability_mask, truth_mask)
                        # ---
                        l = np.array([loss.item() * valid_batch_size, *tn, *tp, *dn, *dp])
                        n = np.array([valid_batch_size, *num_neg, *num_pos, *num_neg, *num_pos])
                        valid_loss += l
                        valid_num += n

                    valid_loss = valid_loss / valid_num

                    # ------
                    test_pos_ratio = np.array([NUM_TEST_POS[c][0] / NUM_TEST
                                               for c in list(CLASSNAME_TO_CLASSNO.keys())])
                    test_neg_ratio = 1 - test_pos_ratio

                    tn, tp, dn, dp = valid_loss[1:].reshape(-1, NUM_CLASS)
                    kaggle = test_neg_ratio * tn + test_neg_ratio * (1 - tn) * dn + test_pos_ratio * tp * dp
                    kaggle = kaggle.mean()
                    kaggle1 = test_neg_ratio * tn + test_pos_ratio * tp
                    kaggle1 = kaggle1.mean()

                    log.write('kaggle value: %f validation loss: %f tn1: %f tn2: %f tn3: %f tn4: %f tp1: %f tp2: %f tp3: %f tp4: %f dn1: %f dn2: %f dn3: %f dn4: %f dp1: %f dp2: %f dp3: %f dp4: %f\n' %
                              (kaggle1, valid_loss[0],
                               valid_loss[1], valid_loss[2], valid_loss[3], valid_loss[4],
                               valid_loss[5], valid_loss[6], valid_loss[7], valid_loss[8],
                               valid_loss[9], valid_loss[10], valid_loss[11], valid_loss[12],
                               valid_loss[13], valid_loss[14], valid_loss[15], valid_loss[16]))

                val_metric_epoch = valid_loss[0]

                if val_metric_epoch <= valid_metric_optimal:
                    log.write('Validation metric improved ({:.6f} --> {:.6f}). Saving model ...'.format(
                        valid_metric_optimal, val_metric_epoch))
                    valid_metric_optimal = val_metric_epoch
                    torch.save(model.state_dict(), checkpoint_filepath)
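

# A minimal invocation sketch for deeplab_training. The folder paths, batch sizes,
# learning rate and epoch counts below are illustrative assumptions, not values taken
# from the original experiment configuration.
if __name__ == '__main__':
    deeplab_training(model_name='deep_se101',
                     model_type='deeplab',
                     optimizer_name='Adam',
                     lr_scheduler_name='WarmRestart',
                     lr=3e-4,
                     batch_size=8,
                     valid_batch_size=8,
                     num_epoch=30,
                     start_epoch=1,
                     accumulation_steps=4,
                     train_data_folder='./data/cloud',
                     checkpoint_folder='./checkpoints',
                     load_pretrain=False)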
def get_images(self, net_student=None, targets=None):
    print("get_images call")

    net_teacher = self.net_teacher
    use_fp16 = self.use_fp16
    save_every = self.save_every
    kl_loss = nn.KLDivLoss(reduction='batchmean').cuda()
    local_rank = torch.cuda.current_device()
    best_cost = 1e4
    criterion = self.criterion

    # setup target labels
    if targets is None:
        # only works for classification now; for other tasks a target vector must be provided
        targets = torch.LongTensor([random.randint(0, 999) for _ in range(self.bs)]).to('cuda')
        if not self.random_label:
            # preselected classes, good for ResNet50v1.5
            targets = [1, 933, 946, 980, 25, 63, 92, 94, 107, 985, 151, 154, 207,
                       250, 270, 277, 283, 292, 294, 309, 311, 325, 340, 360, 386,
                       402, 403, 409, 530, 440, 468, 417, 590, 670, 817, 762, 920,
                       949, 963, 967, 574, 487]
            targets = torch.LongTensor(targets * (int(self.bs / len(targets)))).to('cuda')

    img_original = self.image_resolution

    data_type = torch.half if use_fp16 else torch.float
    inputs = torch.randn((self.bs, 3, img_original, img_original),
                         requires_grad=True, device='cuda', dtype=data_type)
    pooling_function = nn.modules.pooling.AvgPool2d(kernel_size=2)

    if self.setting_id == 0:
        skipfirst = False
    else:
        skipfirst = True

    iteration = 0
    for lr_it, lower_res in enumerate([2, 1]):
        if lr_it == 0:
            iterations_per_layer = 2000
        else:
            iterations_per_layer = 1000 if not skipfirst else 2000
            if self.setting_id == 2:
                iterations_per_layer = 20000

        if lr_it == 0 and skipfirst:
            continue

        lim_0, lim_1 = self.jitter // lower_res, self.jitter // lower_res

        if self.setting_id == 0:
            # multi resolution: 2k iterations at low resolution, 1k at normal;
            # ResNet50v1.5 works the best, ResNet50 is ok
            optimizer = optim.Adam([inputs], lr=self.lr, betas=[0.5, 0.9], eps=1e-8)
            do_clip = True
        elif self.setting_id == 1:
            # 2k iterations at normal resolution, for ResNet50v1.5; ResNet50 works as well
            optimizer = optim.Adam([inputs], lr=self.lr, betas=[0.5, 0.9], eps=1e-8)
            do_clip = True
        elif self.setting_id == 2:
            # 20k iterations at normal resolution, the closest to the paper experiments for ResNet50
            optimizer = optim.Adam([inputs], lr=self.lr, betas=[0.9, 0.999], eps=1e-8)
            do_clip = False

        if use_fp16:
            static_loss_scale = "dynamic"
            _, optimizer = amp.initialize([], optimizer, opt_level="O2", loss_scale=static_loss_scale)

        lr_scheduler = lr_cosine_policy(self.lr, 100, iterations_per_layer)

        for iteration_loc in range(iterations_per_layer):
            iteration += 1

            # learning rate scheduling
            lr_scheduler(optimizer, iteration_loc, iteration_loc)

            # perform downsampling if needed
            if lower_res != 1:
                inputs_jit = pooling_function(inputs)
            else:
                inputs_jit = inputs

            # apply random jitter offsets
            off1 = random.randint(-lim_0, lim_0)
            off2 = random.randint(-lim_1, lim_1)
            inputs_jit = torch.roll(inputs_jit, shifts=(off1, off2), dims=(2, 3))

            # flipping
            flip = random.random() > 0.5
            if flip and self.do_flip:
                inputs_jit = torch.flip(inputs_jit, dims=(3,))

            # forward pass
            optimizer.zero_grad()
            net_teacher.zero_grad()

            outputs = net_teacher(inputs_jit)
            outputs = self.network_output_function(outputs)

            # R_cross classification loss
            loss = criterion(outputs, targets)

            # R_prior losses
            loss_var_l1, loss_var_l2 = get_image_prior_losses(inputs_jit)

            # R_feature loss
            rescale = [self.first_bn_multiplier] + [1. for _ in range(len(self.loss_r_feature_layers) - 1)]
            loss_r_feature = sum([mod.r_feature * rescale[idx]
                                  for (idx, mod) in enumerate(self.loss_r_feature_layers)])

            # R_ADI
            loss_verifier_cig = torch.zeros(1)
            if self.adi_scale != 0.0:
                if self.detach_student:
                    outputs_student = net_student(inputs_jit).detach()
                else:
                    outputs_student = net_student(inputs_jit)

                T = 3.0

                # Jensen-Shannon divergence:
                # another way to force KL between negative probabilities
                P = nn.functional.softmax(outputs_student / T, dim=1)
                Q = nn.functional.softmax(outputs / T, dim=1)
                M = 0.5 * (P + Q)

                P = torch.clamp(P, 0.01, 0.99)
                Q = torch.clamp(Q, 0.01, 0.99)
                M = torch.clamp(M, 0.01, 0.99)
                eps = 0.0
                loss_verifier_cig = 0.5 * kl_loss(torch.log(P + eps), M) + \
                                    0.5 * kl_loss(torch.log(Q + eps), M)
                # JS criterion: 0 means full correlation, 1 means completely different
                loss_verifier_cig = 1.0 - torch.clamp(loss_verifier_cig, 0.0, 1.0)

                if local_rank == 0:
                    if iteration % save_every == 0:
                        print('loss_verifier_cig', loss_verifier_cig.item())

            # l2 loss on images
            loss_l2 = torch.norm(inputs_jit.view(self.bs, -1), dim=1).mean()

            # combining losses
            loss_aux = self.var_scale_l2 * loss_var_l2 + \
                       self.var_scale_l1 * loss_var_l1 + \
                       self.bn_reg_scale * loss_r_feature + \
                       self.l2_scale * loss_l2

            if self.adi_scale != 0.0:
                loss_aux += self.adi_scale * loss_verifier_cig

            loss = self.main_loss_multiplier * loss + loss_aux

            if local_rank == 0:
                if iteration % save_every == 0:
                    print("------------iteration {}----------".format(iteration))
                    print("total loss", loss.item())
                    print("loss_r_feature", loss_r_feature.item())
                    print("main criterion", criterion(outputs, targets).item())

                    if self.hook_for_display is not None:
                        self.hook_for_display(inputs, targets)

            # do image update
            if use_fp16:
                # optimizer.backward(loss)
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()

            # clip color outliers
            if do_clip:
                inputs.data = clip(inputs.data, use_fp16=use_fp16)

            # keep the lowest-loss images seen so far
            if best_cost > loss.item() or iteration == 1:
                best_inputs = inputs.data.clone()
                best_cost = loss.item()

            if iteration % save_every == 0 and (save_every > 0):
                if local_rank == 0:
                    vutils.save_image(inputs,
                                      '{}/best_images/output_{:05d}_gpu_{}.png'.format(
                                          self.prefix, iteration // save_every, local_rank),
                                      normalize=True, scale_each=True, nrow=int(10))

    if self.store_best_images:
        best_inputs = denormalize(best_inputs)
        self.save_images(best_inputs, targets)

    # to reduce memory consumption by states of the optimizer we deallocate memory
    optimizer.state = collections.defaultdict(dict)
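

# Standalone sketch of the Jensen-Shannon term used above for the adaptive (R_ADI)
# loss, shown in isolation for clarity. The batch size, number of classes and the
# temperature value are illustrative assumptions, not values from the training setup.
import torch
import torch.nn as nn


def js_divergence_loss(student_logits, teacher_logits, T=3.0):
    # symmetric KL between the tempered softmax distributions of student and teacher
    kl = nn.KLDivLoss(reduction='batchmean')
    P = nn.functional.softmax(student_logits / T, dim=1)
    Q = nn.functional.softmax(teacher_logits / T, dim=1)
    M = torch.clamp(0.5 * (P + Q), 0.01, 0.99)
    P = torch.clamp(P, 0.01, 0.99)
    Q = torch.clamp(Q, 0.01, 0.99)
    js = 0.5 * kl(torch.log(P), M) + 0.5 * kl(torch.log(Q), M)
    # 0 means the two distributions agree, 1 means they are maximally different
    return 1.0 - torch.clamp(js, 0.0, 1.0)


# example: student and teacher disagreeing on random logits
student = torch.randn(4, 1000)
teacher = torch.randn(4, 1000)
print(js_divergence_loss(student, teacher).item())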