class CustomScheduler(_LRScheduler):
    timestep: int = 0

    def __init__(self, optimizer, gamma, warmup=None):
        self.optimizer = optimizer
        self.after_warmup = ExponentialLR(optimizer, gamma=gamma)
        self.initial_lrs = [
            p_group['lr'] for p_group in self.optimizer.param_groups
        ]
        self.warmup = 0 if warmup is None else warmup
        super(CustomScheduler, self).__init__(optimizer)

    def get_lr(self):
        return [self.timestep * group_init_lr / self.warmup
                for group_init_lr in self.initial_lrs] \
            if self.timestep < self.warmup else self.after_warmup.get_lr()

    def step(self, epoch=None):
        if self.timestep < self.warmup:
            self.timestep += 1
            super(CustomScheduler, self).step(epoch)
        else:
            self.after_warmup.step(epoch)
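A minimal usage sketch for the scheduler above (assumes PyTorch is installed and that CustomScheduler, with _LRScheduler and ExponentialLR from torch.optim.lr_scheduler, is in scope; the model, gamma and warmup values are illustrative only):

# Illustrative only: step CustomScheduler once per iteration so the learning
# rate ramps linearly for `warmup` steps and then decays exponentially.
import torch
from torch import nn

model = nn.Linear(16, 4)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = CustomScheduler(optimizer, gamma=0.999, warmup=100)

for step in range(1000):
    optimizer.zero_grad()
    loss = model(torch.randn(8, 16)).pow(2).mean()  # dummy loss
    loss.backward()
    optimizer.step()
    scheduler.step()  # advances the warmup counter, then delegates to ExponentialLR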
class ParamOptim:
    def __init__(
        self,
        params: List[torch.Tensor],
        lr: LRParam,
        eps: float = .0003,
        clip_grad: float = None,
        optimizer: Optimizer = Adam,
        retain_graph=False,
    ):
        self.params = params
        self.clip_grad = clip_grad
        self.optim = optimizer(self.params, lr=lr.start, eps=eps)
        self.retain_graph = retain_graph
        self.lr_scheduler = ExponentialLR(self.optim, lr.decay_rate)
        self.lr = lr
        self.lr_need_update = True

    def step_lr(self, n_iter):
        if self.lr_need_update or \
                (n_iter % self.lr.update_every == 0
                 and n_iter // self.lr.update_every <= self.lr.last_update):
            self.lr_need_update = False
            ep = min(n_iter // self.lr.update_every, self.lr.last_update)
            self.lr_scheduler.step(ep)
            return self.lr_scheduler.get_lr()[0]
        else:
            return None

    def step(self, loss):
        self.optim.zero_grad()
        loss.backward(retain_graph=self.retain_graph)
        if self.clip_grad is not None:
            torch.nn.utils.clip_grad_norm_(self.params, self.clip_grad)
        self.optim.step()
        return loss
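The LRParam type referenced above is not part of this snippet; a hypothetical stand-in with only the attributes ParamOptim actually reads (start, decay_rate, update_every, last_update) could look like this, with illustrative values:

# Hypothetical stand-in for LRParam plus a minimal use of ParamOptim;
# the field values are examples, not the original project's settings.
from dataclasses import dataclass

import torch


@dataclass
class LRParam:
    start: float = 1e-3        # initial learning rate
    decay_rate: float = 0.99   # gamma passed to ExponentialLR
    update_every: int = 1000   # decay the LR every N iterations
    last_update: int = 50      # stop decaying after this many updates


w = torch.zeros(10, requires_grad=True)
param_optim = ParamOptim([w], lr=LRParam())
param_optim.step((w - 1).pow(2).mean())        # one gradient step on a dummy loss
current_lr = param_optim.step_lr(n_iter=0)     # returns the new LR, or None if unchanged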
params_torch['SDF.bsdf.reflectance.data'].data.clamp_(0.0, 1.0)
try:
    sdf = params_torch['SDF.data'].data.cpu().numpy().reshape([sdf_res] * 3)
    sdf = skfmm.distance(sdf, sdf_scale / sdf_res)
    vtk.record_epoch(epoch, sdf, grad)
    params_torch['SDF.data'].data.copy_(torch.from_numpy(sdf.flatten()))
    if epoch % 10 == 9:
        write_binary_grid3d(f'{out_path}sdf_e{epoch}.vol', sdf)
        write_bitmap(f'{out_path}color_e{epoch:03d}.exr',
                     params['SDF.bsdf.reflectance.data'],
                     [color_texture_res] * 2)
except RuntimeError as e:
    print(
        f'skfmm failed: mean={sdf.mean()}, min={sdf.min()}, max={sdf.max()}'
    )
    print(e)

with open(f"{out_path}log.txt", mode='a+') as f:
    # f.write(','.join(list(map(str, [epoch, lr_scheduler.get_lr()[0], loss_img, *(pyramid_loss.cpu().numpy()), '\n']))))
    f.write(','.join(
        list(map(str, [epoch, lr_scheduler.get_lr()[0], loss_img, "\n"]))))

print(f'epoch {epoch}: loss_img={loss_img}')
def train(model_name, optim='adam'): train_dataset = PretrainDataset(output_shape=config['image_resolution']) train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, num_workers=8, pin_memory=True, drop_last=True) val_dataset = IDRND_dataset_CV(fold=0, mode=config['mode'].replace('train', 'val'), double_loss_mode=True, output_shape=config['image_resolution']) val_loader = DataLoader(val_dataset, batch_size=config['batch_size'], shuffle=True, num_workers=4, drop_last=False) if model_name == 'EF': model = DoubleLossModelTwoHead(base_model=EfficientNet.from_pretrained( 'efficientnet-b3')).to(device) model.load_state_dict( torch.load( f"../models_weights/pretrained/{model_name}_{4}_2.0090592697255896_1.0.pth" )) elif model_name == 'EFGAP': model = DoubleLossModelTwoHead( base_model=EfficientNetGAP.from_pretrained('efficientnet-b3')).to( device) model.load_state_dict( torch.load( f"../models_weights/pretrained/{model_name}_{4}_2.3281182915644134_1.0.pth" )) criterion = FocalLoss(add_weight=False).to(device) criterion4class = CrossEntropyLoss().to(device) if optim == 'adam': optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay']) elif optim == 'sgd': optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'], nesterov=False) else: optimizer = torch.optim.SGD(model.parameters(), momentum=0.9, lr=config['learning_rate'], weight_decay=config['weight_decay'], nesterov=True) steps_per_epoch = train_loader.__len__() - 15 swa = SWA(optimizer, swa_start=config['swa_start'] * steps_per_epoch, swa_freq=int(config['swa_freq'] * steps_per_epoch), swa_lr=config['learning_rate'] / 10) scheduler = ExponentialLR(swa, gamma=0.9) # scheduler = StepLR(swa, step_size=5*steps_per_epoch, gamma=0.5) global_step = 0 for epoch in trange(10): if epoch < 5: scheduler.step() continue model.train() train_bar = tqdm(train_loader) train_bar.set_description_str(desc=f"N epochs - {epoch}") for step, batch in enumerate(train_bar): global_step += 1 image = batch['image'].to(device) label4class = batch['label0'].to(device) label = batch['label1'].to(device) output4class, output = model(image) loss4class = criterion4class(output4class, label4class) loss = criterion(output.squeeze(), label) swa.zero_grad() total_loss = loss4class * 0.5 + loss * 0.5 total_loss.backward() swa.step() train_writer.add_scalar(tag="learning_rate", scalar_value=scheduler.get_lr()[0], global_step=global_step) train_writer.add_scalar(tag="BinaryLoss", scalar_value=loss.item(), global_step=global_step) train_writer.add_scalar(tag="SoftMaxLoss", scalar_value=loss4class.item(), global_step=global_step) train_bar.set_postfix_str(f"Loss = {loss.item()}") try: train_writer.add_scalar(tag="idrnd_score", scalar_value=idrnd_score_pytorch( label, output), global_step=global_step) train_writer.add_scalar(tag="far_score", scalar_value=far_score(label, output), global_step=global_step) train_writer.add_scalar(tag="frr_score", scalar_value=frr_score(label, output), global_step=global_step) train_writer.add_scalar(tag="accuracy", scalar_value=bce_accuracy( label, output), global_step=global_step) except Exception: pass if (epoch > config['swa_start'] and epoch % 2 == 0) or (epoch == config['number_epochs'] - 1): swa.swap_swa_sgd() swa.bn_update(train_loader, model, device) swa.swap_swa_sgd() scheduler.step() evaluate(model, val_loader, epoch, model_name)
def train(self): """Method to train the model.""" self.writer.add_text('Comments', self.comments) train_loader, val_loader, test_loader = self.dataloaders transformations = get_transformations(self.transform_names, sizes=(self.w_size, self.h_size)) self._set_seeds() self.net.apply(self._init_weights) running_losses = list() criterion = define_loss(self.signal_type, self.custom_loss, self.device) optimizer = optim.Adam(self.net.parameters(), lr=self.lr) scheduler = ExponentialLR(optimizer, gamma=0.9) iteration = 0 best_val_prec = 0 self.net.to(self.device) for epoch in range(self.nb_epochs): if epoch % self.lr_step == 0 and epoch != 0: scheduler.step() for _, sequence_data in enumerate(train_loader): seq_name, seq = sequence_data path_to_frames = os.path.join(self.paths['carrada'], seq_name[0]) frame_dataloader = DataLoader(MultiFrameCarradaDataset( seq, self.annot_type, self.signal_type, path_to_frames, self.process_signal, self.n_input_ch, transformations), shuffle=False, batch_size=self.batch_size, num_workers=4) for _, frame in enumerate(frame_dataloader): data = frame['matrix'].to(self.device).float() mask = frame['mask'].to(self.device).float() data = normalize(data, self.signal_type, self.paths['carrada'], norm_type=self.norm_type) optimizer.zero_grad() outputs = self.net(data).to(self.device) mask = F.interpolate(mask, (self.w_size, self.h_size)) loss = criterion(outputs, torch.argmax(mask, axis=1)) loss.backward() optimizer.step() running_losses.append(loss.data.cpu().numpy()[()]) if iteration % self.loss_step == 0: train_loss = np.mean(running_losses) print('[Epoch {}/{}, iter {}]: ' 'train loss {}'.format(epoch + 1, self.nb_epochs, iteration, train_loss)) self.visualizer.update_train_loss( train_loss, iteration) running_losses = list() self.visualizer.update_learning_rate( scheduler.get_lr()[0], iteration) if iteration % self.val_step == 0 and iteration > 0: if iteration % self.viz_step == 0 and iteration > 0: val_metrics = self.tester.predict( self.net, val_loader, iteration) else: val_metrics = self.tester.predict( self.net, val_loader) self.visualizer.update_val_metrics( val_metrics, iteration) print('[Epoch {}/{}] Validation loss: {}'.format( epoch + 1, self.nb_epochs, val_metrics['loss'])) print('[Epoch {}/{}] Validation Pixel Prec: {}'.format( epoch + 1, self.nb_epochs, val_metrics['prec'])) print('[Epoch {}/{}] Validation Pixel Prec by class: ' '{}'.format(epoch + 1, self.nb_epochs, val_metrics['prec_by_class'])) if val_metrics[ 'prec'] > best_val_prec and iteration > 0: best_val_prec = val_metrics['prec'] test_metrics = self.tester.predict( self.net, test_loader) print('[Epoch {}/{}] Test loss: {}'.format( epoch + 1, self.nb_epochs, test_metrics['loss'])) print('[Epoch {}/{}] Test Pixel Prec: {}'.format( epoch + 1, self.nb_epochs, test_metrics['prec'])) print('[Epoch {}/{}] Test Pixel Prec by class: ' '{}'.format(epoch + 1, self.nb_epochs, test_metrics['prec_by_class'])) self.results['train_loss'] = train_loss.item() self.results['val_metrics'] = val_metrics self.results['test_metrics'] = test_metrics self._save_results() self.net.train() # Train mode after evaluation process iteration += 1 self.writer.close()
def train(self, x_train, y_train):
    idx = np.random.permutation(len(x_train))
    print('samples:', len(x_train))
    x_train = np.array(x_train)[idx]
    y_train = np.array(y_train)[idx]
    x_train = torch.tensor(x_train, dtype=torch.float32)
    y_train = torch.tensor(y_train)
    x_val = x_train[32000:]
    x_tr = x_train[:32000]
    y_val = y_train[32000:]
    y_tr = y_train[:32000]
    #x_val = x_train[5000:6000]
    #y_val = y_train[5000:6000]
    #x_tr = x_train[:5000]
    #y_tr = y_train[:5000]
    y_tr_ = y_tr.clone()
    x_tr, x_val = self.PCA(x_tr, x_val)
    print('PCA done', x_tr.shape)
    optimizer = Adam(self.model.parameters(), lr=0.0001, weight_decay=5e-3)
    scheduler = ExponentialLR(optimizer, 1)
    loss_fn = nn.MSELoss()
    train_loader = torch.utils.data.DataLoader(Loader(x_tr, y_tr),
                                               batch_size=256, shuffle=True)
    best_acc = 0
    dist = self.get_dist(x_tr.cuda(), x_val.cuda()).cpu()
    for k in [1, 3, 5, 8, 10, 15, 20, 25, 30]:
        y_pred = self.predict(dist, y_tr_, k)
        acc = (y_pred == y_val).sum().float().numpy() / y_val.shape[0]
        print("K=", k, " acc=", acc)
    for epoch in range(200):
        self.model.train()
        scheduler.step()
        loss_ = acc_ = cnt = yc = 0
        for i, (input, target) in enumerate(train_loader):
            optimizer.zero_grad()
            B = target.shape[0]
            gt_p = target.clone().cuda().view(1, B).float()
            gt = target.clone().cuda()
            output = self.model(input.cuda())
            dists = output.view(B, B)
            dm = dists.sum(0).view(1, -1)
            #dists = dists / dm
            sorted, ind = dists.sort(dim=0, descending=False)
            sorted = sorted[:20]
            ind = ind[:20]
            y_p = gt[ind]
            gt_p = gt_p.expand(20, -1).contiguous().float()
            y_p = y_p.float() - gt_p
            y_p[y_p != 0] = 1
            yy = torch.sum(y_p)
            loss0 = torch.div(1, sorted[y_p != 0])
            loss1 = sorted[y_p == 0]
            loss = loss0.mean() + loss1.mean()
            loss.backward()
            optimizer.step()
            lr = scheduler.get_lr()
            yc += yy.cpu().data.numpy()
            loss_ += loss.cpu().data.numpy()
            cnt += 1
        print('Epoch %2d: loss = %6.5f, %5.3f, lr=%f' %
              (epoch, loss_ / cnt, yc / cnt, lr[0]))
        loss_ = yc = 0
        if (epoch % 20) == 19:
            dist = self.get_dist(x_tr.cuda(), x_val.cuda()).cpu()
            for k in [1, 3, 5, 8, 10, 15, 20, 25, 30]:
                y_pred = self.predict(dist, y_tr_, k)
                acc = (y_pred == y_val).sum().float().numpy() / y_val.shape[0]
                print("K=", k, " acc=", acc)
                if k == 25:
                    acc_25 = acc
            torch.save(self.model.state_dict(), 'knn_dml_checkpoint.pth')
            if best_acc <= acc_25:
                best_acc = acc_25
                torch.save(self.model.state_dict(), 'knn_dml_best_model.pth')
def train(**kwargs): opt._parse(kwargs) carrada = download('Carrada') train_set = Carrada().get('Train') val_set = Carrada().get('Validation') test_set = Carrada().get('Test') train_seqs = SequenceCarradaDataset(train_set) val_seqs = SequenceCarradaDataset(val_set) test_seqs = SequenceCarradaDataset(test_set) train_seqs_loader = data_.DataLoader(train_seqs, \ batch_size=1, \ shuffle=True, \ # pin_memory=True, num_workers=opt.num_workers) val_seqs_loader = data_.DataLoader(val_seqs, batch_size=1, shuffle=False, # pin_memory=True, num_workers=opt.num_workers) test_seqs_loader = data_.DataLoader(test_seqs, batch_size=1, shuffle=False, # pin_memory=True, num_workers=opt.num_workers) # faster_rcnn = FasterRCNNVGG16(n_fg_class=3) # faster_rcnn = FasterRCNNRESNET101(n_fg_class=3) faster_rcnn = FasterRCNNRESNET18(n_fg_class=3) print('model construct completed') trainer = FasterRCNNTrainer(faster_rcnn).cuda() scheduler = ExponentialLR(trainer.faster_rcnn.optimizer, gamma=0.9) if opt.load_path: trainer.load(opt.load_path) print('load pretrained model from %s' % opt.load_path) writer_path = os.path.join(opt.logs_path, opt.model_name) os.makedirs(writer_path, exist_ok=True) writer = SummaryWriter(writer_path) iteration = 0 best_map = 0 lr_ = opt.lr for epoch in range(opt.epoch): print('Processing epoch: {}/{}'.format(epoch, opt.epoch)) trainer.reset_meters() for n_seq, sequence_data in tqdm(enumerate(train_seqs_loader)): seq_name, seq = sequence_data path_to_frames = os.path.join(carrada, seq_name[0]) train_frame_set = CarradaDataset(opt, seq, 'box', opt.signal_type, path_to_frames) train_frame_loader = data_.DataLoader(train_frame_set, batch_size=1, shuffle=False, num_workers=opt.num_workers) for ii, (img, bbox_, label_, scale) in tqdm(enumerate(train_frame_loader)): iteration += 1 scale = at.scalar(scale) img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda() img = normalize(img) if opt.debug_step and (iteration+1) % opt.debug_step == 0: trainer.train_step(img, bbox, label, scale, stop=True) else: trainer.train_step(img, bbox, label, scale) if (iteration + 1) % opt.plot_every == 0: if os.path.exists(opt.debug_file): ipdb.set_trace() train_results = trainer.get_meter_data() writer.add_scalar('Losses/rpn_loc', train_results['rpn_loc_loss'], iteration) writer.add_scalar('Losses/rpn_cls', train_results['rpn_cls_loss'], iteration) writer.add_scalar('Losses/roi_loc', train_results['roi_loc_loss'], iteration) writer.add_scalar('Losses/roi_cls', train_results['roi_cls_loss'], iteration) writer.add_scalar('Losses/total', train_results['total_loss'], iteration) if (iteration + 1) % opt.img_every == 0: ori_img_ = at.tonumpy(img[0]) gt_img = visdom_bbox(ori_img_, at.tonumpy(bbox_[0]), at.tonumpy(label_[0])) gt_img_grid = make_grid(torch.from_numpy(gt_img)) writer.add_image('Ground_truth_img', gt_img_grid, iteration) # plot predicti bboxes _bboxes, _labels, _scores = trainer.faster_rcnn.predict([ori_img_], opt.signal_type, visualize=True) # FLAG: vis pred_img = visdom_bbox(ori_img_, at.tonumpy(_bboxes[0]), at.tonumpy(_labels[0]).reshape(-1), at.tonumpy(_scores[0])) pred_img_grid = make_grid(torch.from_numpy(pred_img)) writer.add_image('Predicted_img', pred_img_grid, iteration) if opt.train_eval and (iteration + 1) % opt.train_eval == 0: train_eval_result, train_best_iou = eval(train_seqs_loader, faster_rcnn, opt.signal_type) writer.add_scalar('Train/mAP', train_eval_result['map'], iteration) writer.add_scalar('Train/Best_IoU', train_best_iou, iteration) eval_result, best_val_iou = 
eval(val_seqs_loader, faster_rcnn, opt.signal_type, test_num=opt.test_num)
writer.add_scalar('Validation/mAP', eval_result['map'], iteration)
writer.add_scalar('Validation/Best_IoU', best_val_iou, iteration)
lr_ = scheduler.get_lr()[0]
writer.add_scalar('learning_rate', lr_, iteration)
log_info = 'lr:{}, map:{},loss:{}'.format(str(lr_), str(eval_result['map']),
                                          str(trainer.get_meter_data()))
print(log_info)

if eval_result['map'] > best_map:
    test_result, test_best_iou = eval(test_seqs_loader, faster_rcnn,
                                      opt.signal_type, test_num=opt.test_num)
    writer.add_scalar('Test/mAP', test_result['map'], iteration)
    writer.add_scalar('Test/Best_IoU', test_best_iou, iteration)
    best_map = eval_result['map']
    best_test_map = test_result['map']
    best_path = trainer.save(best_val_map=best_map,
                             best_test_map=best_test_map)
    # best_path = trainer.save(best_map=best_map)

if (epoch + 1) % opt.lr_step == 0:
    scheduler.step()
    progressbar.DynamicMessage('loss_encoder'), ' ',
    progressbar.DynamicMessage('loss_decoder'), ' ',
    progressbar.DynamicMessage('loss_discriminator'), ' ',
    progressbar.DynamicMessage("epoch")
]

for i in range(num_epochs):
    progress = progressbar.ProgressBar(min_value=0, max_value=batch_number,
                                       initial_value=0,
                                       widgets=widgets).start()
    loss_nle_mean = RollingMeasure()
    loss_encoder_mean = RollingMeasure()
    loss_decoder_mean = RollingMeasure()
    loss_discriminator_mean = RollingMeasure()
    print("LR:{}".format(lr_encoder.get_lr()))
    for j, (data_batch, labels_batch) in enumerate(dataloader):
        net.train()
        # convert to Variables
        data_target = Variable(torch.squeeze(data_batch),
                               requires_grad=False).float().cuda()
        data_in = Variable(data_batch, requires_grad=True).float().cuda()
        # zero the gradients
        net.zero_grad()
        # compute the network output
        out, out_labels, out_layer, mus, variances = net(data_in)
        out_layer_original = out_layer[:len(out_layer) // 2]
print(f"rendered image {i}: loss={ob_val.cpu().item()}" ) # , pyramid_losses={list(pyr_ob.data.cpu().numpy())}") ob_val /= len(cams_origins) loss_img += ob_val.item() # loss_pyr += pyr_ob.data.cpu().numpy() ob_val.backward() if smoothing: params_torch['SDF.data'].grad = smoothing( params_torch['SDF.data'].grad) opt.step() if lr_scheduler: print("lr = ", lr_scheduler.get_lr()[0]) lr_scheduler.step() try: sdf = params_torch['SDF.data'].data.cpu().numpy().reshape([sdf_res] * 3) sdf = skfmm.distance(sdf, sdf_scale / sdf_res) params_torch['SDF.data'].data.copy_(torch.from_numpy(sdf.flatten())) if epoch % 10 == 0: write_binary_grid3d(f'{out_path}sdf_e{epoch}.vol', sdf) except RuntimeError as e: print( f'skfmm failed: mean={sdf.mean()}, min={sdf.min()}, max={sdf.max()}'
train_bar.set_description_str(desc=f"N epochs - {epoch}")
for step, batch in enumerate(train_bar):
    global_step += 1
    image = batch['image'].to(device)
    label4class = batch['label0'].to(device)
    label = batch['label1'].to(device)

    output4class, output = model(image)
    loss4class = criterion4class(output4class, label4class)
    loss = criterion(output.squeeze(), label)
    swa.zero_grad()
    total_loss = loss4class * 0.5 + loss * 0.5
    total_loss.backward()
    swa.step()
    train_writer.add_scalar(tag="learning_rate",
                            scalar_value=scheduler.get_lr()[0],
                            global_step=global_step)
    train_writer.add_scalar(tag="BinaryLoss", scalar_value=loss.item(),
                            global_step=global_step)
    train_writer.add_scalar(tag="SoftMaxLoss", scalar_value=loss4class.item(),
                            global_step=global_step)
    train_bar.set_postfix_str(f"Loss = {loss.item()}")
    try:
        train_writer.add_scalar(tag="idrnd_score",
                                scalar_value=idrnd_score_pytorch(label, output),
                                global_step=global_step)
        train_writer.add_scalar(tag="far_score",
                                scalar_value=far_score(label, output),
                                global_step=global_step)
        train_writer.add_scalar(tag="frr_score",
                                scalar_value=frr_score(label, output),
                                global_step=global_step)
        train_writer.add_scalar(tag="accuracy",
                                scalar_value=bce_accuracy(label, output),
                                global_step=global_step)
    except Exception:
        pass

if (epoch > config['swa_start'] and epoch % 2 == 0) or (epoch == config['number_epochs'] - 1):
    swa.swap_swa_sgd()
    swa.bn_update(train_loader, model, device)
    swa.swap_swa_sgd()
def train(self, add_temp=False): """ Method to train a network PARAMETERS ---------- add_temp: boolean Add a temporal dimension during training? Considering the input as a sequence. Default: False """ self.writer.add_text('Comments', self.comments) train_loader, val_loader, test_loader = self.dataloaders transformations = get_transformations(self.transform_names, sizes=(self.w_size, self.h_size)) self._set_seeds() self.net.apply(self._init_weights) rd_criterion = define_loss('range_doppler', self.custom_loss, self.device) ra_criterion = define_loss('range_angle', self.custom_loss, self.device) nb_losses = len(rd_criterion) running_losses = list() rd_running_losses = list() rd_running_global_losses = [list(), list()] ra_running_losses = list() ra_running_global_losses = [list(), list()] coherence_running_losses = list() optimizer = optim.Adam(self.net.parameters(), lr=self.lr) scheduler = ExponentialLR(optimizer, gamma=0.9) iteration = 0 best_val_prec = 0 self.net.to(self.device) for epoch in range(self.nb_epochs): if epoch % self.lr_step == 0 and epoch != 0: scheduler.step() for _, sequence_data in enumerate(train_loader): seq_name, seq = sequence_data path_to_frames = os.path.join(self.paths['carrada'], seq_name[0]) frame_dataloader = DataLoader(CarradaDataset( seq, self.annot_type, path_to_frames, self.process_signal, self.n_frames, transformations, add_temp), shuffle=self.is_shuffled, batch_size=self.batch_size, num_workers=4) for _, frame in enumerate(frame_dataloader): rd_data = frame['rd_matrix'].to(self.device).float() ra_data = frame['ra_matrix'].to(self.device).float() ad_data = frame['ad_matrix'].to(self.device).float() rd_mask = frame['rd_mask'].to(self.device).float() ra_mask = frame['ra_mask'].to(self.device).float() rd_data = normalize(rd_data, 'range_doppler', norm_type=self.norm_type) ra_data = normalize(ra_data, 'range_angle', norm_type=self.norm_type) if self.model_name == 'tmvanet': ad_data = normalize(ad_data, 'angle_doppler', norm_type=self.norm_type) optimizer.zero_grad() if self.model_name == 'tmvanet': rd_outputs, ra_outputs = self.net( rd_data, ra_data, ad_data) else: rd_outputs, ra_outputs = self.net(rd_data, ra_data) rd_outputs = rd_outputs.to(self.device) ra_outputs = ra_outputs.to(self.device) if nb_losses < 3: # Case without the CoL rd_losses = [ c(rd_outputs, torch.argmax(rd_mask, axis=1)) for c in rd_criterion ] rd_loss = torch.mean(torch.stack(rd_losses)) ra_losses = [ c(ra_outputs, torch.argmax(ra_mask, axis=1)) for c in ra_criterion ] ra_loss = torch.mean(torch.stack(ra_losses)) loss = torch.mean(rd_loss + ra_loss) else: # Case with the CoL # Select the wCE and wSDice rd_losses = [ c(rd_outputs, torch.argmax(rd_mask, axis=1)) for c in rd_criterion[:2] ] rd_loss = torch.mean(torch.stack(rd_losses)) ra_losses = [ c(ra_outputs, torch.argmax(ra_mask, axis=1)) for c in ra_criterion[:2] ] ra_loss = torch.mean(torch.stack(ra_losses)) # Coherence loss coherence_loss = rd_criterion[2](rd_outputs, ra_outputs) loss = torch.mean(rd_loss + ra_loss + coherence_loss) loss.backward() optimizer.step() running_losses.append(loss.data.cpu().numpy()[()]) rd_running_losses.append(rd_loss.data.cpu().numpy()[()]) rd_running_global_losses[0].append( rd_losses[0].data.cpu().numpy()[()]) rd_running_global_losses[1].append( rd_losses[1].data.cpu().numpy()[()]) ra_running_losses.append(ra_loss.data.cpu().numpy()[()]) ra_running_global_losses[0].append( ra_losses[0].data.cpu().numpy()[()]) ra_running_global_losses[1].append( ra_losses[1].data.cpu().numpy()[()]) if nb_losses > 2: 
coherence_running_losses.append( coherence_loss.data.cpu().numpy()[()]) if iteration % self.loss_step == 0: train_loss = np.mean(running_losses) rd_train_loss = np.mean(rd_running_losses) rd_train_losses = [ np.mean(sub_loss) for sub_loss in rd_running_global_losses ] ra_train_loss = np.mean(ra_running_losses) ra_train_losses = [ np.mean(sub_loss) for sub_loss in ra_running_global_losses ] if nb_losses > 2: coherence_train_loss = np.mean( coherence_running_losses) print('[Epoch {}/{}, iter {}]: ' 'train loss {}'.format(epoch + 1, self.nb_epochs, iteration, train_loss)) if nb_losses > 2: self.visualizer.update_multi_train_loss( train_loss, rd_train_loss, rd_train_losses, ra_train_loss, ra_train_losses, iteration, coherence_train_loss) else: self.visualizer.update_multi_train_loss( train_loss, rd_train_loss, rd_train_losses, ra_train_loss, ra_train_losses, iteration) running_losses = list() rd_running_losses = list() ra_running_losses = list() self.visualizer.update_learning_rate( scheduler.get_lr()[0], iteration) if iteration % self.val_step == 0 and iteration > 0: if iteration % self.viz_step == 0 and iteration > 0: val_metrics = self.tester.predict( self.net, val_loader, iteration, add_temp=add_temp) else: val_metrics = self.tester.predict( self.net, val_loader, add_temp=add_temp) self.visualizer.update_multi_val_metrics( val_metrics, iteration) print('[Epoch {}/{}] Validation losses: ' 'RD={}, RA={}'.format( epoch + 1, self.nb_epochs, val_metrics['range_doppler']['loss'], val_metrics['range_angle']['loss'])) print('[Epoch {}/{}] Validation Pixel Prec: ' 'RD={}, RA={}'.format( epoch + 1, self.nb_epochs, val_metrics['range_doppler']['prec'], val_metrics['range_angle']['prec'])) if val_metrics[ 'global_prec'] > best_val_prec and iteration > 0: best_val_prec = val_metrics['global_prec'] test_metrics = self.tester.predict( self.net, test_loader, add_temp=add_temp) print('[Epoch {}/{}] Test losses: ' 'RD={}, RA={}'.format( epoch + 1, self.nb_epochs, test_metrics['range_doppler']['loss'], test_metrics['range_angle']['loss'])) print('[Epoch {}/{}] Test Prec: ' 'RD={}, RA={}'.format( epoch + 1, self.nb_epochs, test_metrics['range_doppler']['prec'], test_metrics['range_angle']['prec'])) self.results['rd_train_loss'] = rd_train_loss.item( ) self.results['ra_train_loss'] = ra_train_loss.item( ) self.results['train_loss'] = train_loss.item() self.results['val_metrics'] = val_metrics self.results['test_metrics'] = test_metrics if nb_losses > 3: self.results[ 'coherence_train_loss'] = coherence_train_loss.item( ) self._save_results() self.net.train() # Train mode after evaluation process iteration += 1 self.writer.close()
    os.makedirs(args.checkpoints_dir)

if args.explore:
    scheduler = ExponentialLR(optimizer, 2.0)
else:
    scheduler = MultiStepLR(optimizer, milestones=args.learning_steps,
                            gamma=args.learning_gamma,
                            last_epoch=starting_epoch - 1)

# training loop
print(f"training for {args.nb_epochs} epochs")
losses = []
learning_rates = []
for epoch in range(starting_epoch, args.nb_epochs):
    scheduler.step()
    logger.write({'learning rate': scheduler.get_lr()[0]}, index=epoch)
    for step in ['train', 'test']:
        metrics = train_eval(model, dataloaders, optimizer, step == 'train')
        logger.write(metrics, curve=f"mean_{step}", increment=False)
        print("                                        ", end='\r')
        print('{}\tEpoch [{}/{}],\tLoss: {:.4f},\tAccuracy: {:.2f}%\t'.format(
            step, epoch, args.nb_epochs, metrics['loss'],
            metrics['accuracy'] * 100), flush=True)
    learning_rates.append(scheduler.get_lr()[0])
    # TODO save best model according to loss
class Trainer(object): ''' An object that encapsulates model training ''' def __init__(self, config, model, dataloader, device): self.model = model self.config = config self.device = device self.stopped_early = False self.dataloader = dataloader self.validation_dataloader = dataloader self.last_checkpoint_time = time.time() if 'cuda' in device.type: self.model = nn.DataParallel(model.cuda()) self.optimizer = optim.Adam(model.parameters(), config.base_lr, betas=(0.9, 0.98), eps=1e-9) if config.lr_scheduler == 'warmup': self.lr_scheduler = LambdaLR( self.optimizer, WarmupLRSchedule( config.warmup_steps ) ) elif config.lr_scheduler == 'linear': self.lr_scheduler = LambdaLR( self.optimizer, LinearLRSchedule( config.base_lr, config.final_lr, config.max_steps ) ) elif config.lr_scheduler == 'exponential': self.lr_scheduler = ExponentialLR( self.optimizer, config.lr_decay ) else: raise ValueError('Unknown learning rate scheduler!') # Initialize the metrics metrics_path = os.path.join(self.config.checkpoint_directory, 'train_metrics.pt') self.metric_store = metrics.MetricStore(metrics_path) self.metric_store.add(metrics.Metric('oom', metrics.format_int, 't')) self.metric_store.add(metrics.Metric('nll', metrics.format_float, max_history=1000)) self.metric_store.add(metrics.Metric('lr', metrics.format_scientific, 'g', max_history=1)) self.metric_store.add(metrics.Metric('num_tok', metrics.format_int, 'a', max_history=1000)) if self.config.early_stopping: self.metric_store.add(metrics.Metric('vnll', metrics.format_float, 'g')) self.modules = { 'model': model, 'optimizer': self.optimizer, 'lr_scheduler': self.lr_scheduler } @property def dataset(self): ''' Get the dataset ''' return self.dataloader.dataset def train_epoch(self, epoch, experiment, verbose=0): ''' Run one training epoch ''' oom = self.metric_store['oom'] learning_rate = self.metric_store['lr'] num_tokens = self.metric_store['num_tok'] neg_log_likelihood = self.metric_store['nll'] def try_optimize(i, last=False): # optimize if: # 1) last and remainder # 2) not last and not remainder remainder = bool(i % self.config.accumulate_steps) if not last ^ remainder: next_lr = self.optimize() learning_rate.update(next_lr) experiment.log_metric('learning_rate', next_lr) return True return False def get_description(): description = f'Train #{epoch}' if verbose > 0: description += f' {self.metric_store}' if verbose > 1: description += f' [{profile.mem_stat_string(["allocated"])}]' return description batches = tqdm( self.dataloader, unit='batch', dynamic_ncols=True, desc=get_description(), file=sys.stdout # needed to make tqdm_wrap_stdout work ) with tqdm_wrap_stdout(): i = 1 nll_per_update = 0. length_per_update = 0 num_tokens_per_update = 0 for i, batch in enumerate(batches, 1): try: nll, length = self.calculate_gradient(batch) did_optimize = try_optimize(i) # record the effective number of tokens num_tokens_per_update += int(sum(batch['input_lens'])) num_tokens_per_update += int(sum(batch['target_lens'])) if length: # record length and nll nll_per_update += nll length_per_update += length if did_optimize: # advance the experiment step experiment.set_step(experiment.curr_step + 1) num_tokens.update(num_tokens_per_update) neg_log_likelihood.update(nll_per_update / length_per_update) experiment.log_metric('num_tokens', num_tokens_per_update) experiment.log_metric('nll', neg_log_likelihood.last_value) nll_per_update = 0. 
length_per_update = 0 num_tokens_per_update = 0 except RuntimeError as rte: if 'out of memory' in str(rte): torch.cuda.empty_cache() oom.update(1) experiment.log_metric('oom', oom.total) else: batches.close() raise rte if self.should_checkpoint(): new_best = False if self.config.early_stopping: with tqdm_unwrap_stdout(): new_best = self.evaluate(experiment, epoch, verbose) self.checkpoint(epoch, experiment.curr_step, new_best) batches.set_description_str(get_description()) if self.is_done(experiment, epoch): batches.close() break try_optimize(i, last=True) def should_checkpoint(self): ''' Function which determines if a new checkpoint should be saved ''' return time.time() - self.last_checkpoint_time > self.config.checkpoint_interval def checkpoint(self, epoch, step, best=False): ''' Save a checkpoint ''' checkpoint_path = checkpoint( epoch, step, self.modules, self.config.checkpoint_directory, max_checkpoints=self.config.max_checkpoints ) if best: dirname = os.path.dirname(checkpoint_path) basename = os.path.basename(checkpoint_path) best_checkpoint_path = os.path.join(dirname, f'best_{basename}') shutil.copy2(checkpoint_path, best_checkpoint_path) self.metric_store.save() self.last_checkpoint_time = time.time() def evaluate(self, experiment, epoch, verbose=0): ''' Evaluate the current model and determine if it is a new best ''' model = self.modules['model'] evaluator = Evaluator(args.ArgGroup(None), model, self.validation_dataloader, self.device) vnll = evaluator(epoch, experiment, verbose) metric = self.metric_store['vnll'] full_history = metric.values metric.update(vnll) self.metric_store.save() return all(vnll < nll for nll in full_history[:-1]) def is_done(self, experiment, epoch): ''' Has training completed ''' if self.config.max_steps and experiment.curr_step >= self.config.max_steps: return True if self.config.max_epochs and epoch >= self.config.max_epochs: return True if self.config.early_stopping: history = self.metric_store['vnll'].values[-self.config.early_stopping - 1:] if len(history) == self.config.early_stopping + 1: self.stopped_early = all(history[-1] > nll for nll in history[:-1]) return self.stopped_early return False def optimize(self): ''' Calculate an optimization step ''' self.lr_scheduler.step() self.optimizer.step() self.optimizer.zero_grad() return self.lr_scheduler.get_lr()[0] def calculate_gradient(self, batch): ''' Runs one step of optimization ''' # run the data through the model self.model.train() loss, nll = self.model(batch) # nn.DataParallel wants to gather rather than doing a reduce_add, so the output here # will be a tensor of values that must be summed nll = nll.sum() loss = loss.sum() # calculate gradients then run an optimization step loss.backward() # need to use .item() which converts to Python scalar # because as a Tensor it accumulates gradients return nll.item(), torch.sum(batch['target_lens']).item() def __call__(self, start_epoch, experiment, verbose=0): ''' Execute training ''' with ExitStack() as stack: stack.enter_context(chunked_scattering()) stack.enter_context(experiment.train()) if start_epoch > 0 or experiment.curr_step > 0: # TODO: Hacky approach to decide if the metric store should be loaded. 
        # Revisit later
        self.metric_store = self.metric_store.load()

    epoch = start_epoch
    experiment.log_current_epoch(epoch)
    while not self.is_done(experiment, epoch):
        experiment.log_current_epoch(epoch)
        self.train_epoch(epoch, experiment, verbose)
        experiment.log_epoch_end(epoch)
        epoch += 1

    if self.stopped_early:
        print('Stopping early!')
    else:
        new_best = False
        if self.config.early_stopping:
            new_best = self.evaluate(experiment, epoch, verbose)
        self.checkpoint(epoch, experiment.curr_step, new_best)
opt.step()
try:
    sdf = params_torch['SDF.data'].data.cpu().numpy().reshape([sdf_res] * 3)
    sdf = skfmm.distance(sdf, sdf_scale / sdf_res)
    params_torch['SDF.data'].data.copy_(torch.from_numpy(sdf.flatten()))
    if epoch % 10 == 9:
        write_binary_grid3d(f'{out_path}sdf_e{epoch}.vol', sdf)
except RuntimeError as e:
    print(
        f'skfmm failed: mean={sdf.mean()}, min={sdf.min()}, max={sdf.max()}'
    )
    print(e)

print(
    f'epoch {epoch}: lr={lr_scheduler.get_lr()[0]}, total_loss={np.mean(loss_imgs)}, loss_imgs={loss_imgs}'
)
with open(f"{out_path}log.txt", mode='a+') as f:
    f.write(','.join(
        list(
            map(str, [
                epoch,
                lr_scheduler.get_lr()[0],
                np.mean(loss_imgs), *loss_imgs, "\n"
            ]))))

lr_scheduler.step()
class Train(object): def __init__(self): self.vocab = Vocab(config.vocab_path, config.vocab_size) self.batcher = Batcher(config.train_data_path, self.vocab, mode='train', batch_size=config.batch_size, single_pass=False) time.sleep(15) self.val_batcher = Batcher(config.eval_data_path, self.vocab, mode='eval', batch_size=config.batch_size, single_pass=False) time.sleep(15) train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time()))) if not os.path.exists(train_dir): os.mkdir(train_dir) self.model_dir = os.path.join(train_dir, 'model') if not os.path.exists(self.model_dir): os.mkdir(self.model_dir) self.summary_writer = tf.compat.v1.summary.FileWriter(train_dir) def save_model(self, running_avg_loss, iter): state = { 'iter': iter, 'encoder_state_dict': self.model.encoder.state_dict(), 'decoder_state_dict': self.model.decoder.state_dict(), 'reduce_state_dict': self.model.reduce_state.state_dict(), 'optimizer': self.optimizer.state_dict(), 'current_loss': running_avg_loss } model_save_path = os.path.join( self.model_dir, 'model_%d_%d' % (iter, int(time.time()))) torch.save(state, model_save_path) def setup_train(self, model_file_path=None): self.model = Model(model_file_path) params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \ list(self.model.reduce_state.parameters()) initial_lr = config.lr_coverage if config.is_coverage else config.lr self.optimizer = Adagrad( params, lr=initial_lr, initial_accumulator_value=config.adagrad_init_acc) self.scheduler = ExponentialLR(self.optimizer, gamma=0.99) start_iter, start_loss = 0, 0 if model_file_path is not None: #途中から始める場合 state = torch.load(model_file_path, map_location=lambda storage, location: storage) start_iter = state['iter'] start_loss = state['current_loss'] if not config.is_coverage: #coverageが無しの場合 self.optimizer.load_state_dict(state['optimizer']) if use_cuda: for state in self.optimizer.state.values(): for k, v in state.items(): if torch.is_tensor(v): state[k] = v.cuda() return start_iter, start_loss def train_one_batch(self, batch, iter): enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \ get_input_from_batch(batch, use_cuda) dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \ get_output_from_batch(batch, use_cuda) self.optimizer.zero_grad() encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder( enc_batch, enc_lens) s_t_1 = self.model.reduce_state(encoder_hidden) step_losses = [] words = [] for di in range(min(max_dec_len, config.max_dec_steps)): y_t_1 = dec_batch[:, di] # Teacher forcing final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder( y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di) words.append(self.vocab.id2word(final_dist[0].argmax().item())) target = target_batch[:, di] gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze() step_loss = -torch.log(gold_probs + config.eps) # print('step_loss',step_loss) # print('step_loss.size()',step_loss.size()) if config.is_coverage: step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1) step_loss = step_loss + config.cov_loss_wt * step_coverage_loss coverage = next_coverage step_mask = dec_padding_mask[:, di] step_loss = step_loss * step_mask step_losses.append(step_loss) if iter % 100 == 0: print(words) print([self.vocab.id2word(idx.item()) for idx in dec_batch[0]]) print([self.vocab.id2word(idx.item()) for idx in target_batch[0]]) 
sum_losses = torch.sum(torch.stack(step_losses, 1), 1) batch_avg_loss = sum_losses / dec_lens_var loss = torch.mean(batch_avg_loss) loss.backward() self.norm = clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm) clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm) clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm) self.optimizer.step() return loss.item() def eval_one_batch(self, batch): enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \ get_input_from_batch(batch, use_cuda) dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \ get_output_from_batch(batch, use_cuda) encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder( enc_batch, enc_lens) s_t_1 = self.model.reduce_state(encoder_hidden) step_losses = [] for di in range(min(max_dec_len, config.max_dec_steps)): y_t_1 = dec_batch[:, di] # Teacher forcing final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder( y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di) target = target_batch[:, di] gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze() step_loss = -torch.log(gold_probs + config.eps) if config.is_coverage: step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1) step_loss = step_loss + config.cov_loss_wt * step_coverage_loss coverage = next_coverage step_mask = dec_padding_mask[:, di] step_loss = step_loss * step_mask step_losses.append(step_loss) sum_step_losses = torch.sum(torch.stack(step_losses, 1), 1) batch_avg_loss = sum_step_losses / dec_lens_var loss = torch.mean(batch_avg_loss) # print(loss) # print(type(loss)) # print(loss.data) # print(loss.data.item()) # return loss.data[0] return loss.data.item() def trainIters(self, n_iters, model_file_path=None): iter, running_avg_loss = self.setup_train(model_file_path) start = time.time() while iter < n_iters: batch = self.batcher.next_batch() loss = self.train_one_batch(batch, iter) val_loss = None if iter % 100 == 0: val_batch = self.val_batcher.next_batch() val_loss = self.eval_one_batch(val_batch) # print("val_loss",val_loss) self.scheduler.step() print("lr", self.scheduler.get_lr()) running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, self.summary_writer, iter) iter += 1 if iter % 100 == 0: self.summary_writer.flush() print_interval = 1 if iter % print_interval == 0: if val_loss is not None: print( 'steps %d, seconds for %d batch: %.2f , loss: %f , eval_loss: %f' % (iter, print_interval, time.time() - start, loss, val_loss)) else: print('steps %d, seconds for %d batch: %.2f , loss: %f' % (iter, print_interval, time.time() - start, loss)) start = time.time() if iter % 1000 == 0: self.save_model(running_avg_loss, iter)
        **params_torch)
    write_bitmap(f'{out_path}{i}_image_e{epoch:03d}.png', image, crop_size)
    #ob_val = lambda_img * objective(image, images_ref[i]) / len(cams_origins)
    ob_val, pyr_ob = objective(image, i)
    print(
        f"rendered image {i}: loss={ob_val.cpu().item()}, pyramid_losses={list(pyr_ob.data.cpu().numpy())}"
    )
    ob_val /= len(cams_origins)
    loss_img += ob_val.item()
    ob_val.backward()

opt.step()
if lr_scheduler:
    print("lr = ", lr_scheduler.get_lr()[0])
    lr_scheduler.step()
if epoch == T_max and T_max > 0:
    lr_scheduler = CosineAnnealingLR(opt, T_max=T_max, eta_min=0.001)

print(ek.hsum(params['SDF.bsdf.reflectance.data']))
try:
    sdf = params_torch['SDF.data'].data.cpu().numpy().reshape([sdf_res] * 3)
    sdf = skfmm.distance(sdf, sdf_scale / sdf_res)
    if epoch in sdf_res_squedule:
        pass
        # print(sdf.shape, sdf.flatten().shape)
        # sdf = double_sdf_res(sdf)
def train(self, x_train, y_train):
    idx = np.random.permutation(len(x_train))
    print('samples:', len(x_train))
    x_train = np.array(x_train)[idx]
    y_train = np.array(y_train)[idx]
    x_val = x_train[35000:]
    x_tr = x_train[:35000]
    y_val = y_train[35000:]
    y_tr = y_train[:35000]
    print(np.max(y_tr))
    #optimizer = SGD(self.model.parameters(), lr=0.1, weight_decay=1e-4, nesterov=True, momentum=0.9)
    optimizer = Adam(self.model.parameters(), lr=0.001, weight_decay=1e-4)
    scheduler = ExponentialLR(optimizer, 0.98)
    loss_fn = nn.CrossEntropyLoss()
    train_loader = torch.utils.data.DataLoader(Loader(x_tr, y_tr),
                                               batch_size=256, shuffle=True)
    val_loader = torch.utils.data.DataLoader(Loader(x_val, y_val),
                                             batch_size=256, shuffle=False)
    best_acc = 0
    for epoch in range(50):
        self.model.train()
        scheduler.step()
        loss_ = acc_ = cnt = 0
        for i, (input, target) in enumerate(train_loader):
            output = self.model(input[0].cuda(), input[1].cuda())
            optimizer.zero_grad()
            loss = loss_fn(output, target.cuda().view(-1))
            loss.backward()
            optimizer.step()
            lr = scheduler.get_lr()
            pred = output.max(1)[1].cpu()
            match = torch.sum(pred == target.view(-1)).float() / target.shape[0]
            loss_ += loss.cpu().data.numpy()
            acc_ += match.data.numpy()
            cnt += 1
        print('Epoch %2d: loss = %6.5f, training acc=%5.3f, lr=%f' %
              (epoch, loss_ / cnt, acc_ * 100 / cnt, lr[0]))
        loss_ = acc_ = 0

        acc_ = 0
        val_cnt = 0
        self.model.eval()
        for i, (input, target) in enumerate(val_loader):
            with torch.no_grad():
                output = self.model(input[0].cuda(), input[1].cuda())
            pred = output.max(1)[1].cpu()
            acc_ += torch.sum(pred == target.view(-1)).float() / target.shape[0]
            val_cnt += 1
        star = '*' if best_acc <= (acc_ / val_cnt) else ''
        print('val acc= %5.3f' % (acc_ / val_cnt) + star)
        torch.save(self.model.state_dict(), 'rnn_checkpoint.pth')
        if best_acc <= (acc_ / val_cnt):
            best_acc = (acc_ / val_cnt)
            torch.save(self.model.state_dict(), 'rnn_best_model.pth')
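A general note on the get_lr() calls used throughout these examples (this reflects PyTorch's documented scheduler API rather than any single project above): since PyTorch 1.4, calling scheduler.get_lr() outside of step() emits a UserWarning, and scheduler.get_last_lr() or the optimizer's param_groups are the recommended ways to read the current learning rate:

# Applies to any optimizer/scheduler pair above; shown with a fresh pair so
# the lines are runnable on their own.
import torch
from torch.optim.lr_scheduler import ExponentialLR

optimizer = torch.optim.SGD([torch.zeros(1, requires_grad=True)], lr=0.1)
scheduler = ExponentialLR(optimizer, gamma=0.9)

current_lr = scheduler.get_last_lr()[0]       # PyTorch >= 1.4
current_lr = optimizer.param_groups[0]['lr']  # works on any version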