def train_epoch(loader, model, optimizer, device, tag=''):
    """Train model for one epoch."""
    total_loss = Counter()
    model.train()
    criterion = nn.L1Loss()

    with tqdm(total=len(loader.dataset)) as t:
        t.set_description(tag)

        for data in loader:
            images, targets = data
            count = len(images)

            # Move data to device
            images = images.to(device)
            targets = targets.to(device)

            predicts = model(images)
            loss = criterion(predicts, targets)
            loss_value = loss.item()

            if not math.isfinite(loss_value):
                print("Loss is {}, stopping training".format(loss_value))
                sys.exit(1)

            # Update running loss
            total_loss.update(loss_value, count)
            t.set_postfix(loss='{:.6f}'.format(total_loss.avg))
            t.update(count)

            # Optimizer step (optionally with NVIDIA Apex mixed-precision loss scaling)
            optimizer.zero_grad()
            if os.environ.get("ENABLE_APEX") == "YES":
                from apex import amp
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            optimizer.step()

    return total_loss.avg
def train_epoch(loader, model, optimizer, device, tag=''):
    """Train model for one epoch."""
    total_loss = Counter()
    model.train()
    criterion = nn.L1Loss()

    with tqdm(total=len(loader.dataset)) as t:
        t.set_description(tag)

        for data in loader:
            images, targets = data
            count = len(images)

            # Move data to device
            images = images.to(device)
            targets = targets.to(device)

            predicts = model(images)
            loss = criterion(predicts, targets)
            loss_value = loss.item()

            # Free input/output tensors early; the graph held by `loss`
            # keeps the references needed for backward.
            del images, targets, predicts
            torch.cuda.empty_cache()

            if not math.isfinite(loss_value):
                print("Loss is {}, stopping training".format(loss_value))
                sys.exit(1)

            # Update running loss
            total_loss.update(loss_value, count)
            t.set_postfix(loss='L1Loss: {:.6f}'.format(total_loss.avg))
            t.update(count)

            # Optimizer step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            del loss
            torch.cuda.empty_cache()

    return total_loss.avg
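# The train_epoch variants above rely on a project-local ``Counter`` averaging
# meter (with .update(value, n) and .avg); it is not collections.Counter.
# Below is a minimal sketch assuming that interface, plus a hypothetical
# driver loop showing how train_epoch might be called. All names in the
# usage comment (model, train_loader, num_epochs) are placeholders.

class Counter(object):
    """Running average meter: tracks a weighted mean of reported values."""

    def __init__(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, value, n=1):
        # `value` is the per-batch average loss, `n` the number of samples in the batch
        self.sum += value * n
        self.count += n
        self.avg = self.sum / max(self.count, 1)


# Hypothetical usage:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = model.to(device)
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
# for epoch in range(num_epochs):
#     avg_loss = train_epoch(train_loader, model, optimizer, device,
#                            tag="epoch {}".format(epoch))
#     print("epoch {}: avg L1 loss = {:.6f}".format(epoch, avg_loss))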
def train_epoch(loader, model, optimizer, model_d, device, tag=''):
    """Train model for one epoch."""
    total_loss = Counter()
    model.train()

    with tqdm(total=len(loader.dataset)) as t:
        t.set_description(tag)

        for data in loader:
            images, masks = data
            count = len(images)

            # Move data to device
            images = images.to(device)
            masks = masks.to(device)

            # The unmasked images are the ground truth for the inpainted output
            GT = images
            new_images, new_masks = image_with_mask(images, masks)
            fake_images = model(new_images, new_masks)

            # model_d computes the generator losses against the ground truth
            G_loss = model_d(new_images[:, 0:3, :, :], new_masks, fake_images, GT)
            loss = G_loss.sum()
            loss_value = loss.item()

            if not math.isfinite(loss_value):
                print("Loss is {}, stopping training".format(loss_value))
                sys.exit(1)

            # Update running loss
            total_loss.update(loss_value, count)
            t.set_postfix(loss='{:.6f}'.format(total_loss.avg))
            t.update(count)

            # Optimizer step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    return total_loss.avg
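# ``image_with_mask`` is project-specific and not shown above. A common
# inpainting convention, consistent with the ``new_images[:, 0:3, :, :]``
# slice in train_epoch, is to blank out the masked pixels and append the
# mask as an extra input channel. A minimal sketch under that assumption
# (the real helper may differ):

import torch


def image_with_mask(images, masks):
    # images: (N, 3, H, W); masks: (N, 1, H, W) with 1 = known pixel, 0 = hole
    holed = images * masks                      # zero out the regions to inpaint
    new_images = torch.cat([holed, masks], 1)   # (N, 4, H, W): RGB + mask channel
    return new_images, masks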
def distill(args, output_model_file, processor, label_list, tokenizer, device, n_gpu,
            tensorboard_logger, eval_data=None):
    assert args.kd_policy is not None
    model = args.kd_policy.student
    args.kd_policy.teacher.eval()

    num_labels = len(args.labels)
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    save_best_model = eval_data is not None and args.eval_interval > 0

    train_examples = processor.get_train_examples(args.data_dir)
    num_train_steps = int(
        len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
    optimizer, t_total = get_optimizer(args, model, num_train_steps)

    train_data = prepare(args, processor, label_list, tokenizer, 'train')

    logger.info("***** Running distillation *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    if args.local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    train_steps = 0
    best_eval_accuracy = 0
    for epoch in trange(int(args.num_train_epochs), desc="Epoch", dynamic_ncols=True):
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        args.kd_policy.on_epoch_begin(model, None, None)

        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", dynamic_ncols=True)):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch

            model.train()
            logits = args.kd_policy.forward(input_ids, segment_ids, input_mask)
            loss = CrossEntropyLoss()(logits.view(-1, num_labels), label_ids.view(-1))
            # The KD policy combines the distillation loss with the student loss
            loss = args.kd_policy.before_backward_pass(model, epoch, None, None, loss, None).overall_loss

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()

            train_steps += 1
            tensorboard_logger.add_scalar('distillation_train_loss', loss.item(), train_steps)

            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1

            if (step + 1) % args.gradient_accumulation_steps == 0:
                # Modify learning rate with the special warmup schedule BERT uses
                lr_this_step = args.learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

            if save_best_model and train_steps % args.eval_interval == 0:
                eval_loss, eval_accuracy, _ = eval(args, model, eval_data, device, verbose=False)
                tensorboard_logger.add_scalar('distillation_dev_loss', eval_loss, train_steps)
                tensorboard_logger.add_scalar('distillation_dev_accuracy', eval_accuracy, train_steps)
                if eval_accuracy > best_eval_accuracy:
                    save_model(model, output_model_file)
                    best_eval_accuracy = eval_accuracy

        args.kd_policy.on_epoch_end(model, None, None)

    if save_best_model:
        eval_loss, eval_accuracy, _ = eval(args, model, eval_data, device, verbose=False)
        if eval_accuracy > best_eval_accuracy:
            save_model(model, output_model_file)
    else:
        save_model(model, output_model_file)

    return global_step, tr_loss / nb_tr_steps
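# ``warmup_linear`` is called above as warmup_linear(global_step / t_total,
# args.warmup_proportion) and scales args.learning_rate. Below is a minimal
# sketch of the linear warmup / linear decay schedule BERT uses, consistent
# with that call signature; the repo's own helper may differ.

def warmup_linear(x, warmup=0.002):
    # x is the fraction of total training steps completed (0..1)
    if x < warmup:
        return x / warmup        # ramp the learning rate up linearly during warmup
    return max(0.0, 1.0 - x)     # then decay it linearly toward zero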