# NOTE: these snippets rely on project-specific helpers (Engine, ConvBuilder,
# create_dataset, train_one_step, ...) imported elsewhere; only the standard
# third-party imports are listed here.
import os
import os.path as osp
import time
from collections import OrderedDict

import numpy as np
import torch
from PIL import Image
from torch.optim import Adam, lr_scheduler
from torch.utils.tensorboard import SummaryWriter
from torchvision.utils import make_grid
from tqdm import tqdm


def eval_one_epoch(net, batch_generator, DEVICE=torch.device('cuda:0'), AttackMethod=None):
    net.eval()
    pbar = tqdm(batch_generator)
    clean_accuracy = AvgMeter()
    adv_accuracy = AvgMeter()
    pbar.set_description('Evaluating')
    for (data, label) in pbar:
        data = data.to(DEVICE)
        label = label.to(DEVICE)
        with torch.no_grad():
            pred = net(data)
            acc = torch_accuracy(pred, label, (1,))
            clean_accuracy.update(acc[0].item())
        if AttackMethod is not None:
            adv_inp = AttackMethod.attack(net, data, label)
            with torch.no_grad():
                pred = net(adv_inp)
                acc = torch_accuracy(pred, label, (1,))
                adv_accuracy.update(acc[0].item())
        pbar_dic = OrderedDict()
        pbar_dic['CleanAcc'] = '{:.2f}'.format(clean_accuracy.mean)
        pbar_dic['AdvAcc'] = '{:.2f}'.format(adv_accuracy.mean)
        pbar.set_postfix(pbar_dic)
    adv_acc = adv_accuracy.mean if AttackMethod is not None else 0
    return clean_accuracy.mean, adv_acc
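
# --- Illustrative sketch (not the original implementations): AvgMeter and
# torch_accuracy are used throughout this file but never defined here. The
# minimal versions below are inferred from usage (.update()/.mean/.avg and
# top-k accuracy in percent); the real helpers may differ.
class AvgMeter:
    """Running mean of scalar values; exposes both .mean and .avg."""

    def __init__(self):
        self.sum = 0.0
        self.count = 0

    def update(self, value, n=1):
        self.sum += value * n
        self.count += n

    @property
    def mean(self):
        return self.sum / self.count if self.count else 0.0

    @property
    def avg(self):
        return self.mean


def torch_accuracy(output, target, topk=(1,)):
    """Top-k accuracies (in percent), one 0-dim tensor per requested k."""
    maxk = max(topk)
    _, pred = output.topk(maxk, dim=1)      # (N, maxk) predicted class indices
    correct = pred.eq(target.view(-1, 1))   # compare against broadcast labels
    return [correct[:, :k].float().sum() * (100.0 / target.size(0)) for k in topk]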
def my_eval_one_epoch(net, batch_generator, DEVICE=torch.device('cuda:0'), AttackMethod=None):
    net.eval()
    pbar = tqdm(batch_generator)
    clean_accuracy = AvgMeter()
    adv_accuracy = AvgMeter()
    correct_indices = None
    natural_indices = None
    pbar.set_description('Evaluating')
    for (data, label) in pbar:
        data = data.to(DEVICE)
        label = label.to(DEVICE)
        with torch.no_grad():
            pred = net(data)
            predictions = np.argmax(pred.cpu().numpy(), axis=1)
            correct_labels = label.cpu().numpy()
            # NOTE: np.where returns positions *within the batch*, so the
            # accumulated indices are batch-relative, not dataset-level.
            if natural_indices is None:
                natural_indices = np.where(predictions == correct_labels)[0]
            else:
                natural_indices = np.append(natural_indices,
                                            np.where(predictions == correct_labels)[0])
            acc = torch_accuracy(pred, label, (1,))
            clean_accuracy.update(acc[0].item())
        if AttackMethod is not None:
            adv_inp = AttackMethod.attack(net, data, label)
            with torch.no_grad():
                pred = net(adv_inp)
                predictions = np.argmax(pred.cpu().numpy(), axis=1)
                correct_labels = label.cpu().numpy()
                if correct_indices is None:
                    correct_indices = np.where(predictions == correct_labels)[0]
                else:
                    correct_indices = np.append(correct_indices,
                                                np.where(predictions == correct_labels)[0])
                acc = my_torch_accuracy(pred, label, (1,))
                adv_accuracy.update(acc[0].item())
        pbar_dic = OrderedDict()
        pbar_dic['CleanAcc'] = '{:.2f}'.format(clean_accuracy.mean)
        pbar_dic['AdvAcc'] = '{:.2f}'.format(adv_accuracy.mean)
        pbar.set_postfix(pbar_dic)
    adv_acc = adv_accuracy.mean if AttackMethod is not None else 0
    print('Natural Samples', natural_indices.shape)
    print('Adversarial Samples', correct_indices.shape)
    return clean_accuracy.mean, adv_acc
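
# Illustrative fix (an assumption, not from the original): the indices collected
# in my_eval_one_epoch are batch-relative. Offsetting by the number of samples
# seen so far turns them into dataset-level indices.
def batch_correct_indices(predictions, correct_labels, samples_seen):
    """Dataset-level indices of correctly classified samples in one batch."""
    return np.where(predictions == correct_labels)[0] + samples_seen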
def train(self):
    for curr_epoch in range(self.start_epoch, self.end_epoch):
        train_loss_record = AvgMeter()
        self._train_per_epoch(curr_epoch, train_loss_record)

        # Adjust the learning rate once per epoch.
        if not self.arg_dict["sche_usebatch"]:
            self.sche.step()

        # Save and test at every epoch; the saved parameters correspond to
        # epoch curr_epoch + 1.
        save_checkpoint(
            model=self.net,
            optimizer=self.opti,
            scheduler=self.sche,
            amp=self.amp,
            exp_name=self.exp_name,
            current_epoch=curr_epoch + 1,
            full_net_path=self.path_dict["final_full_net"],
            state_net_path=self.path_dict["final_state_net"],
        )  # save the parameters
        if self.arg_dict["use_amp"]:
            # https://github.com/NVIDIA/apex/issues/567
            with self.amp.disable_casts():
                construct_print("When evaluating, we wish to evaluate in pure fp32.")
                self.test()
        else:
            self.test()
def test(self, save_pre):
    if self.only_test:
        self.resume_checkpoint(load_path=self.pth_path, mode='onlynet')
    self.net.eval()
    loader = self.te_loader
    pres = [AvgMeter() for _ in range(256)]
    recs = [AvgMeter() for _ in range(256)]
    meanfs = AvgMeter()
    maes = AvgMeter()
    tqdm_iter = tqdm(enumerate(loader), total=len(loader), leave=False)
    for test_batch_id, test_data in tqdm_iter:
        tqdm_iter.set_description(f"{self.model_name}: te=>{test_batch_id + 1}")
        with torch.no_grad():
            in_imgs, in_names, in_mask_paths = test_data
            in_imgs = in_imgs.to(self.dev, non_blocking=True)
            outputs = self.net(in_imgs)
            outputs_np = outputs.cpu().detach()
        for item_id, out_item in enumerate(outputs_np):
            gimg_path = osp.join(in_mask_paths[item_id])
            gt_img = Image.open(gimg_path).convert("L")
            out_img = self.to_pil(out_item).resize(gt_img.size)
            if save_pre:
                oimg_path = osp.join(self.save_path, in_names[item_id] + ".png")
                out_img.save(oimg_path)
            gt_img = np.asarray(gt_img)
            out_img = np.array(out_img)
            ps, rs, mae, meanf = cal_pr_mae_meanf(out_img, gt_img)
            for pidx, pdata in enumerate(zip(ps, rs)):
                p, r = pdata
                pres[pidx].update(p)
                recs[pidx].update(r)
            maes.update(mae)
            meanfs.update(meanf)
    maxf = cal_maxf([pre.avg for pre in pres], [rec.avg for rec in recs])
    results = {"MAXF": maxf, "MEANF": meanfs.avg, "MAE": maes.avg}
    return results
def eval_one_epoch(net, batch_generator, DEVICE=torch.device('cuda:0')):
    net.eval()
    pbar = tqdm(batch_generator)
    clean_accuracy = AvgMeter()
    pbar.set_description('Evaluating')
    for (data, label) in pbar:
        data = data.to(DEVICE)
        label = label.to(DEVICE)
        with torch.no_grad():
            pred = net(data)
            acc = torch_accuracy(pred, label, (1,))
            clean_accuracy.update(acc[0].item())
        pbar_dic = OrderedDict()
        pbar_dic['CleanAcc'] = '{:.2f}'.format(clean_accuracy.mean)
        pbar.set_postfix(pbar_dic)
    return clean_accuracy.mean
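
# Illustrative PGD attacker matching the AttackMethod.attack(net, data, label)
# interface used by the eval loops above. Epsilon and step sizes are assumptions,
# and inputs are assumed to live in [0, 1].
import torch.nn.functional as F


class PGDAttack:
    def __init__(self, epsilon=8 / 255, step_size=2 / 255, num_steps=10):
        self.epsilon = epsilon
        self.step_size = step_size
        self.num_steps = num_steps

    def attack(self, net, data, label):
        # random start inside the epsilon ball
        adv = (data.detach()
               + torch.empty_like(data).uniform_(-self.epsilon, self.epsilon)).clamp(0.0, 1.0)
        for _ in range(self.num_steps):
            adv.requires_grad_(True)
            loss = F.cross_entropy(net(adv), label)
            grad = torch.autograd.grad(loss, adv)[0]
            # ascend the loss, then project back into the epsilon ball and [0, 1]
            adv = adv.detach() + self.step_size * grad.sign()
            adv = data + (adv - data).clamp(-self.epsilon, self.epsilon)
            adv = adv.clamp(0.0, 1.0).detach()
        return adv

# usage sketch: eval_one_epoch(net, loader, AttackMethod=PGDAttack())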
def train(args, model, device, train_loader, optimizer, epoch, descrip_str='Training'):
    model.train()
    pbar = tqdm(train_loader)
    pbar.set_description(descrip_str)
    CleanAccMeter = AvgMeter()
    TradesAccMeter = AvgMeter()
    for batch_idx, (data, target) in enumerate(pbar):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        # calculate robust loss
        loss, cleanloss, klloss, cleanacc, tradesacc = trades_loss(
            model=model,
            x_natural=data,
            y=target,
            optimizer=optimizer,
            device=device,
            step_size=args.step_size,
            epsilon=args.epsilon,
            perturb_steps=args.num_steps,
            beta=args.beta,
        )
        loss.backward()
        optimizer.step()
        CleanAccMeter.update(cleanacc)
        TradesAccMeter.update(tradesacc)
        pbar_dic = OrderedDict()
        pbar_dic['cleanloss'] = '{:.3f}'.format(cleanloss)
        pbar_dic['klloss'] = '{:.3f}'.format(klloss)
        pbar_dic['CleanAcc'] = '{:.2f}'.format(CleanAccMeter.mean)
        pbar_dic['TradesAcc'] = '{:.2f}'.format(TradesAccMeter.mean)
        pbar.set_postfix(pbar_dic)
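
# Hedged sketch of a TRADES-style loss with the 5-tuple return signature used
# above (loss, clean CE, KL term, clean accuracy, robust accuracy). Inputs are
# assumed to be in [0, 1], and `device` is kept only to match the call site;
# the actual trades_loss may differ in details.
def trades_loss(model, x_natural, y, optimizer, device, step_size=0.007,
                epsilon=0.031, perturb_steps=10, beta=6.0):
    model.eval()
    nat_logits = model(x_natural).detach()
    # inner maximization: find x_adv maximizing KL(model(x_adv) || model(x_natural))
    x_adv = x_natural.detach() + 0.001 * torch.randn_like(x_natural)
    for _ in range(perturb_steps):
        x_adv.requires_grad_(True)
        kl = F.kl_div(F.log_softmax(model(x_adv), dim=1),
                      F.softmax(nat_logits, dim=1), reduction='batchmean')
        grad = torch.autograd.grad(kl, x_adv)[0]
        x_adv = x_adv.detach() + step_size * grad.sign()
        x_adv = torch.min(torch.max(x_adv, x_natural - epsilon), x_natural + epsilon)
        x_adv = x_adv.clamp(0.0, 1.0)
    model.train()
    optimizer.zero_grad()
    logits = model(x_natural)
    adv_logits = model(x_adv)
    clean_loss = F.cross_entropy(logits, y)
    kl_loss = F.kl_div(F.log_softmax(adv_logits, dim=1),
                       F.softmax(logits, dim=1), reduction='batchmean')
    loss = clean_loss + beta * kl_loss
    clean_acc = (logits.argmax(dim=1) == y).float().mean().item() * 100
    trades_acc = (adv_logits.argmax(dim=1) == y).float().mean().item() * 100
    return loss, clean_loss.item(), kl_loss.item(), clean_acc, trades_acc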
def train_main(local_rank, cfg: BaseConfigByEpoch, net=None, train_dataloader=None,
               val_dataloader=None, show_variables=False, convbuilder=None, init_hdf5=None,
               no_l2_keywords='depth', gradient_mask=None, use_nesterov=False,
               tensorflow_style_init=False, load_weights_keyword=None, keyword_to_lr_mult=None,
               auto_continue=False, lasso_keyword_to_strength=None, save_hdf5_epochs=10000):
    if no_l2_keywords is None:
        no_l2_keywords = []
    if type(no_l2_keywords) is not list:
        no_l2_keywords = [no_l2_keywords]
    ensure_dir(cfg.output_dir)
    ensure_dir(cfg.tb_dir)
    with Engine(local_rank=local_rank) as engine:
        engine.setup_log(name='train', log_dir=cfg.output_dir, file_name='log.txt')

        # ----------------------------- build model ------------------------------
        if convbuilder is None:
            convbuilder = ConvBuilder(base_config=cfg)
        if net is None:
            net_fn = get_model_fn(cfg.dataset_name, cfg.network_type)
            model = net_fn(cfg, convbuilder)
        else:
            model = net
        model = model.cuda()
        # ----------------------------- model done ------------------------------

        # ---------------------------- prepare data ------------------------------
        if train_dataloader is None:
            train_data = create_dataset(cfg.dataset_name, cfg.dataset_subset,
                                        cfg.global_batch_size, distributed=engine.distributed)
        if cfg.val_epoch_period > 0 and val_dataloader is None:
            val_data = create_dataset(cfg.dataset_name, 'val', global_batch_size=100,
                                      distributed=False)
        engine.echo('NOTE: Data prepared')
        engine.echo('NOTE: We have global_batch_size={} on {} GPUs, the allocated GPU memory is {}'
                    .format(cfg.global_batch_size, torch.cuda.device_count(),
                            torch.cuda.memory_allocated()))
        # ----------------------------- data done --------------------------------

        # ------------------ prepare optimizer, scheduler, criterion --------------
        optimizer = get_optimizer(engine, cfg, model, no_l2_keywords=no_l2_keywords,
                                  use_nesterov=use_nesterov, keyword_to_lr_mult=keyword_to_lr_mult)
        scheduler = get_lr_scheduler(cfg, optimizer)
        criterion = get_criterion(cfg).cuda()
        # --------------------------------- done ----------------------------------

        engine.register_state(scheduler=scheduler, model=model, optimizer=optimizer)

        if engine.distributed:
            torch.cuda.set_device(local_rank)
            engine.echo('Distributed training, device {}'.format(local_rank))
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank], broadcast_buffers=False)
        else:
            assert torch.cuda.device_count() == 1
            engine.echo('Single GPU training')

        if tensorflow_style_init:
            init_as_tensorflow(model)
        if cfg.init_weights:
            engine.load_checkpoint(cfg.init_weights)
        if init_hdf5:
            engine.load_hdf5(init_hdf5, load_weights_keyword=load_weights_keyword)
        if auto_continue:
            assert cfg.init_weights is None
            engine.load_checkpoint(get_last_checkpoint(cfg.output_dir))
        if show_variables:
            engine.show_variables()

        # ------------ do training ----------------------------
        engine.log("\n\nStart training with pytorch version {}".format(torch.__version__))

        iteration = engine.state.iteration
        iters_per_epoch = num_iters_per_epoch(cfg)
        max_iters = iters_per_epoch * cfg.max_epochs
        tb_writer = SummaryWriter(cfg.tb_dir)
        tb_tags = ['Top1-Acc', 'Top5-Acc', 'Loss']

        model.train()

        done_epochs = iteration // iters_per_epoch
        last_epoch_done_iters = iteration % iters_per_epoch

        if done_epochs == 0 and last_epoch_done_iters == 0:
            engine.save_hdf5(os.path.join(cfg.output_dir, 'init.hdf5'))

        recorded_train_time = 0
        recorded_train_examples = 0
        collected_train_loss_sum = 0
        collected_train_loss_count = 0

        if gradient_mask is not None:
            gradient_mask_tensor = {}
            for name, value in gradient_mask.items():
                gradient_mask_tensor[name] = torch.Tensor(value).cuda()
        else:
            gradient_mask_tensor = None

        for epoch in range(done_epochs, cfg.max_epochs):
            if engine.distributed and hasattr(train_data, 'train_sampler'):
                train_data.train_sampler.set_epoch(epoch)

            if epoch == done_epochs:
                pbar = tqdm(range(iters_per_epoch - last_epoch_done_iters))
            else:
                pbar = tqdm(range(iters_per_epoch))

            if epoch == 0 and local_rank == 0:
                val_during_train(epoch=epoch, iteration=iteration, tb_tags=tb_tags,
                                 engine=engine, model=model, val_data=val_data,
                                 criterion=criterion, descrip_str='Init',
                                 dataset_name=cfg.dataset_name,
                                 test_batch_size=TEST_BATCH_SIZE, tb_writer=tb_writer)

            top1 = AvgMeter()
            top5 = AvgMeter()
            losses = AvgMeter()
            discrip_str = 'Epoch-{}/{}'.format(epoch, cfg.max_epochs)
            pbar.set_description('Train' + discrip_str)

            for _ in pbar:
                start_time = time.time()
                data, label = load_cuda_data(train_data, dataset_name=cfg.dataset_name)
                data_time = time.time() - start_time

                if_accum_grad = ((iteration % cfg.grad_accum_iters) != 0)

                train_net_time_start = time.time()
                acc, acc5, loss = train_one_step(model, data, label, optimizer, criterion,
                                                 if_accum_grad,
                                                 gradient_mask_tensor=gradient_mask_tensor,
                                                 lasso_keyword_to_strength=lasso_keyword_to_strength)
                train_net_time_end = time.time()

                if iteration > TRAIN_SPEED_START * max_iters and iteration < TRAIN_SPEED_END * max_iters:
                    recorded_train_examples += cfg.global_batch_size
                    recorded_train_time += train_net_time_end - train_net_time_start

                scheduler.step()

                for module in model.modules():
                    if hasattr(module, 'set_cur_iter'):
                        module.set_cur_iter(iteration)

                if iteration % cfg.tb_iter_period == 0 and engine.world_rank == 0:
                    for tag, value in zip(tb_tags, [acc.item(), acc5.item(), loss.item()]):
                        tb_writer.add_scalars(tag, {'Train': value}, iteration)

                top1.update(acc.item())
                top5.update(acc5.item())
                losses.update(loss.item())

                if epoch >= cfg.max_epochs - COLLECT_TRAIN_LOSS_EPOCHS:
                    collected_train_loss_sum += loss.item()
                    collected_train_loss_count += 1

                pbar_dic = OrderedDict()
                pbar_dic['data-time'] = '{:.2f}'.format(data_time)
                pbar_dic['cur_iter'] = iteration
                pbar_dic['lr'] = scheduler.get_lr()[0]
                pbar_dic['top1'] = '{:.5f}'.format(top1.mean)
                pbar_dic['top5'] = '{:.5f}'.format(top5.mean)
                pbar_dic['loss'] = '{:.5f}'.format(losses.mean)
                pbar.set_postfix(pbar_dic)

                iteration += 1

                if iteration >= max_iters or iteration % cfg.ckpt_iter_period == 0:
                    engine.update_iteration(iteration)
                    if (not engine.distributed) or (engine.distributed and engine.world_rank == 0):
                        engine.save_and_link_checkpoint(cfg.output_dir)

                if iteration >= max_iters:
                    break

            # do something after an epoch?
            engine.update_iteration(iteration)
            engine.save_latest_ckpt(cfg.output_dir)

            if (epoch + 1) % save_hdf5_epochs == 0:
                engine.save_hdf5(os.path.join(cfg.output_dir, 'epoch-{}.hdf5'.format(epoch)))

            if local_rank == 0 and cfg.val_epoch_period > 0 and \
                    (epoch >= cfg.max_epochs - 10 or epoch % cfg.val_epoch_period == 0):
                val_during_train(epoch=epoch, iteration=iteration, tb_tags=tb_tags,
                                 engine=engine, model=model, val_data=val_data,
                                 criterion=criterion, descrip_str=discrip_str,
                                 dataset_name=cfg.dataset_name,
                                 test_batch_size=TEST_BATCH_SIZE, tb_writer=tb_writer)

            if iteration >= max_iters:
                break

        # do something after the training
        if recorded_train_time > 0:
            exp_per_sec = recorded_train_examples / recorded_train_time
        else:
            exp_per_sec = 0
        engine.log(
            'TRAIN speed: from {} to {} iterations, batch_size={}, examples={}, '
            'total_net_time={:.4f}, examples/sec={}'
            .format(int(TRAIN_SPEED_START * max_iters), int(TRAIN_SPEED_END * max_iters),
                    cfg.global_batch_size, recorded_train_examples, recorded_train_time,
                    exp_per_sec))
        if cfg.save_weights:
            engine.save_checkpoint(cfg.save_weights)
            print('NOTE: training finished, saved to {}'.format(cfg.save_weights))
        engine.save_hdf5(os.path.join(cfg.output_dir, 'finish.hdf5'))
        if collected_train_loss_count > 0:
            engine.log('TRAIN LOSS collected over last {} epochs: {:.6f}'.format(
                COLLECT_TRAIN_LOSS_EPOCHS,
                collected_train_loss_sum / collected_train_loss_count))
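
# A plausible shape for the val_during_train helper called above (an assumption;
# the original is not shown). It mirrors the inline validation logic of
# ding_train below: run run_eval, write TensorBoard scalars, and log the result.
# test_batch_size is kept only to match the call sites.
def val_during_train(epoch, iteration, tb_tags, engine, model, val_data, criterion,
                     descrip_str, dataset_name, test_batch_size, tb_writer):
    model.eval()
    # use batch_size=100 for val on ImageNet and CIFAR, as elsewhere in this file
    val_iters = 500 if dataset_name == 'imagenet' else 100
    eval_dict, _ = run_eval(val_data, val_iters, model, criterion, descrip_str,
                            dataset_name=dataset_name)
    top1, top5, loss = (eval_dict['top1'].item(), eval_dict['top5'].item(),
                        eval_dict['loss'].item())
    for tag, value in zip(tb_tags, [top1, top5, loss]):
        tb_writer.add_scalars(tag, {'Val': value}, iteration)
    engine.log('validate at epoch {}, top1={:.5f}, top5={:.5f}, loss={:.6f}'
               .format(epoch, top1, top5, loss))
    model.train()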
def train(self):
    for curr_epoch in range(self.start_epoch, self.end_epoch):
        train_loss_record = AvgMeter()
        for train_batch_id, train_data in enumerate(self.tr_loader):
            curr_iter = curr_epoch * len(self.tr_loader) + train_batch_id

            self.opti.zero_grad()
            train_inputs, train_masks, *train_other_data = train_data
            train_inputs = train_inputs.to(self.dev, non_blocking=True)
            train_masks = train_masks.to(self.dev, non_blocking=True)
            train_preds = self.net(train_inputs)
            train_loss, loss_item_list = self.total_loss(train_preds, train_masks)
            train_loss.backward()
            self.opti.step()

            if self.args["sche_usebatch"]:
                if self.args["lr_type"] == "poly":
                    self.sche.step(curr_iter + 1)
                else:
                    raise NotImplementedError

            # Call item() only when accumulating the loss value.
            train_iter_loss = train_loss.item()
            train_batch_size = train_inputs.size(0)
            train_loss_record.update(train_iter_loss, train_batch_size)

            # Write to TensorBoard.
            if self.args["tb_update"] > 0 and (curr_iter + 1) % self.args["tb_update"] == 0:
                self.tb.add_scalar("data/trloss_avg", train_loss_record.avg, curr_iter)
                self.tb.add_scalar("data/trloss_iter", train_iter_loss, curr_iter)
                self.tb.add_scalar("data/trlr", self.opti.param_groups[0]["lr"], curr_iter)
                tr_tb_mask = make_grid(train_masks, nrow=train_batch_size, padding=5)
                self.tb.add_image("trmasks", tr_tb_mask, curr_iter)
                tr_tb_out_1 = make_grid(train_preds, nrow=train_batch_size, padding=5)
                self.tb.add_image("trsodout", tr_tb_out_1, curr_iter)

            # Log the data of each iteration.
            if self.args["print_freq"] > 0 and (curr_iter + 1) % self.args["print_freq"] == 0:
                log = (f"[I:{curr_iter}/{self.iter_num}][E:{curr_epoch}:{self.end_epoch}]>"
                       f"[{self.model_name}]"
                       f"[Lr:{self.opti.param_groups[0]['lr']:.7f}]"
                       f"[Avg:{train_loss_record.avg:.5f}|Cur:{train_iter_loss:.5f}|"
                       f"{loss_item_list}]")
                print(log)
                make_log(self.path["tr_log"], log)

        # Adjust the learning rate once per epoch.
        if not self.args["sche_usebatch"]:
            if self.args["lr_type"] == "poly":
                self.sche.step(curr_epoch + 1)
            else:
                raise NotImplementedError

        # Save and test at every epoch; the saved parameters correspond to
        # epoch curr_epoch + 1.
        self.save_checkpoint(curr_epoch + 1,
                             full_net_path=self.path['final_full_net'],
                             state_net_path=self.path['final_state_net'])  # save the parameters

    total_results = {}
    for data_name, data_path in self.te_data_list.items():
        construct_print(f"Testing with testset: {data_name}")
        self.te_loader, self.te_length = create_loader(data_path=data_path, mode='test',
                                                       get_length=True)
        self.save_path = os.path.join(self.path["save"], data_name)
        if not os.path.exists(self.save_path):
            construct_print(f"{self.save_path} does not exist. Let's create it.")
            os.makedirs(self.save_path)
        results = self.test(save_pre=self.save_pre)
        msg = f"Results on the testset({data_name}:'{data_path}'): {results}"
        construct_print(msg)
        make_log(self.path["te_log"], msg)
        total_results[data_name.upper()] = results
    # Save the results into an xlsx file.
    write_xlsx(self.model_name, total_results)
def __call__(self, teacher_net: TriNet, student_net: TriNet):
    opt = Adam(student_net.parameters(), lr=self.lr(self._gen), weight_decay=1e-5)
    milestones = list(range(self.args.first_milestone, self.args.num_epochs,
                            self.args.step_milestone))
    scheduler = lr_scheduler.MultiStepLR(opt, milestones=milestones, gamma=self.args.gamma)

    for e in range(self.args.num_epochs):
        if e % self.args.eval_epoch_interval == 0 and e > 0:
            self.evaluate(student_net)

        avm = AvgMeter(['kl', 'triplet', 'class', 'similarity', 'loss'])
        student_net.student_mode()
        teacher_net.teacher_mode()

        for x, y, cams in self.train_loader:
            x, y = x.to(self.device), y.to(self.device)
            # Shuffle the frames of each sequence independently; the student
            # sees only the first num_student_images shuffled frames.
            x_ = torch.stack([x[i, torch.randperm(x.shape[1])] for i in range(x.shape[0])])
            x_teacher, x_student = x, x_[:, :self.args.num_student_images]

            with torch.no_grad():
                teacher_emb, teacher_logits = teacher_net(x_teacher, return_logits=True)

            opt.zero_grad()
            student_emb, student_logits = student_net(x_student, return_logits=True)

            kl_div_batch = self.distill_loss(teacher_logits, student_logits)
            similarity_loss_batch = self.similarity_loss(teacher_emb, student_emb)
            triplet_loss_batch = self.triplet_loss(student_emb, y)
            class_loss_batch = self.class_loss(student_logits, y)
            loss = (triplet_loss_batch + class_loss_batch) + \
                   self.args.lambda_coeff * similarity_loss_batch + \
                   self.args.kl_coeff * kl_div_batch

            avm.add([kl_div_batch.item(), triplet_loss_batch.item(),
                     class_loss_batch.item(), similarity_loss_batch.item(), loss.item()])

            loss.backward()
            opt.step()

        scheduler.step()

        if self._epoch % self.args.print_epoch_interval == 0:
            stats = avm()
            str_ = f"Epoch: {self._epoch}"
            for (l, m) in stats:
                str_ += f" - {l} {m:.2f}"
                self.saver.dump_metric_tb(m, self._epoch, 'losses', f"avg_{l}")
            # Log the *current* LR; opt.defaults['lr'] would stay at the initial
            # value and ignore the scheduler.
            self.saver.dump_metric_tb(opt.param_groups[0]['lr'], self._epoch, 'lr', 'lr')
            print(str_)
        self._epoch += 1

    self._gen += 1
    return student_net
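
# The AvgMeter used in the distillation loop above is a *named* variant: built
# from a list of keys, fed with .add([...]), and called to obtain (name, mean)
# pairs. A minimal hypothetical reconstruction consistent with that usage:
class NamedAvgMeter:
    def __init__(self, names):
        self.names = list(names)
        self.sums = [0.0] * len(self.names)
        self.count = 0

    def add(self, values):
        assert len(values) == len(self.names)
        for i, v in enumerate(values):
            self.sums[i] += v
        self.count += 1

    def __call__(self):
        return [(n, s / max(self.count, 1)) for n, s in zip(self.names, self.sums)]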
def ding_train(cfg: BaseConfigByEpoch, net=None, train_dataloader=None, val_dataloader=None,
               show_variables=False, convbuilder=None):
    ensure_dir(cfg.output_dir)
    ensure_dir(cfg.tb_dir)
    with Engine(cfg) as engine:
        is_main_process = (engine.world_rank == 0)  # TODO correct?
        logger = engine.setup_log(name='train', log_dir=cfg.output_dir, file_name='log.txt')

        # -- typical model components: model, opt, scheduler, dataloader --
        if net is None:
            net = get_model_fn(cfg.dataset_name, cfg.network_type)
        if convbuilder is None:
            convbuilder = ConvBuilder()
        model = net(cfg, convbuilder).cuda()

        if train_dataloader is None:
            train_dataloader = create_dataset(cfg.dataset_name, cfg.dataset_subset,
                                              cfg.global_batch_size)
        if cfg.val_epoch_period > 0 and val_dataloader is None:
            val_dataloader = create_dataset(cfg.dataset_name, 'val', batch_size=100)  # TODO 100?
        print('NOTE: Data prepared')
        print('NOTE: We have global_batch_size={} on {} GPUs, the allocated GPU memory is {}'
              .format(cfg.global_batch_size, torch.cuda.device_count(),
                      torch.cuda.memory_allocated()))

        optimizer = get_optimizer(cfg, model)
        scheduler = get_lr_scheduler(cfg, optimizer)
        criterion = get_criterion(cfg).cuda()

        engine.register_state(scheduler=scheduler, model=model, optimizer=optimizer)

        if engine.distributed:
            print('Distributed training, engine.world_rank={}'.format(engine.world_rank))
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[engine.world_rank], broadcast_buffers=False)

        if engine.continue_state_object:
            engine.restore_checkpoint()
        else:
            if cfg.init_weights:
                engine.load_checkpoint(cfg.init_weights, is_restore=False)

        if show_variables:
            engine.show_variables()

        # ------------ do training ----------------------------
        logger.info("\n\nStart training with pytorch version {}".format(torch.__version__))

        iteration = engine.state.iteration
        iters_per_epoch = num_iters_per_epoch(cfg)
        max_iters = iters_per_epoch * cfg.max_epochs
        tb_writer = SummaryWriter(cfg.tb_dir)
        tb_tags = ['Top1-Acc', 'Top5-Acc', 'Loss']

        model.train()
        done_epochs = iteration // iters_per_epoch

        for epoch in range(done_epochs, cfg.max_epochs):
            pbar = tqdm(range(iters_per_epoch))
            top1 = AvgMeter()
            top5 = AvgMeter()
            losses = AvgMeter()
            discrip_str = 'Epoch-{}/{}'.format(epoch, cfg.max_epochs)
            pbar.set_description('Train' + discrip_str)

            if cfg.val_epoch_period > 0 and epoch % cfg.val_epoch_period == 0:
                model.eval()
                # use batch_size=100 for val on ImageNet and CIFAR
                val_iters = 500 if cfg.dataset_name == 'imagenet' else 100
                eval_dict, _ = run_eval(val_dataloader, val_iters, model, criterion,
                                        discrip_str, dataset_name=cfg.dataset_name)
                val_top1_value = eval_dict['top1'].item()
                val_top5_value = eval_dict['top5'].item()
                val_loss_value = eval_dict['loss'].item()
                for tag, value in zip(tb_tags, [val_top1_value, val_top5_value, val_loss_value]):
                    tb_writer.add_scalars(tag, {'Val': value}, iteration)
                engine.log('validate at epoch {}, top1={:.5f}, top5={:.5f}, loss={:.6f}'
                           .format(epoch, val_top1_value, val_top5_value, val_loss_value))
                model.train()

            for _ in pbar:
                scheduler.step()
                start_time = time.time()
                data, label = load_cuda_data(train_dataloader, cfg.dataset_name)
                data_time = time.time() - start_time
                if_accum_grad = ((iteration % cfg.grad_accum_iters) != 0)
                acc, acc5, loss = train_one_step(model, data, label, optimizer, criterion,
                                                 if_accum_grad)

                if iteration % cfg.tb_iter_period == 0 and is_main_process:
                    for tag, value in zip(tb_tags, [acc.item(), acc5.item(), loss.item()]):
                        tb_writer.add_scalars(tag, {'Train': value}, iteration)

                top1.update(acc.item())
                top5.update(acc5.item())
                losses.update(loss.item())

                pbar_dic = OrderedDict()
                pbar_dic['data-time'] = '{:.2f}'.format(data_time)
                pbar_dic['cur_iter'] = iteration
                pbar_dic['lr'] = scheduler.get_lr()[0]
                pbar_dic['top1'] = '{:.5f}'.format(top1.mean)
                pbar_dic['top5'] = '{:.5f}'.format(top5.mean)
                pbar_dic['loss'] = '{:.5f}'.format(losses.mean)
                pbar.set_postfix(pbar_dic)

                if iteration >= max_iters or iteration % cfg.ckpt_iter_period == 0:
                    engine.update_iteration(iteration)
                    if (not engine.distributed) or (engine.distributed and is_main_process):
                        engine.save_and_link_checkpoint(cfg.output_dir)

                iteration += 1
                if iteration >= max_iters:
                    break

            # do something after an epoch?
            if iteration >= max_iters:
                break

        # do something after the training
        engine.save_checkpoint(cfg.save_weights)
        print('NOTE: training finished, saved to {}'.format(cfg.save_weights))
def run_eval(ds_val, max_iters, net, criterion, discrip_str, dataset_name):
    pbar = tqdm(range(max_iters))
    top1 = AvgMeter()
    top5 = AvgMeter()
    losses = AvgMeter()
    pbar.set_description('Validation' + discrip_str)
    total_net_time = 0
    with torch.no_grad():
        for iter_idx, i in enumerate(pbar):
            start_time = time.time()
            data, label = load_cuda_data(ds_val, dataset_name=dataset_name)
            data_time = time.time() - start_time

            net_time_start = time.time()
            pred = net(data)
            net_time_end = time.time()
            if iter_idx >= SPEED_TEST_SAMPLE_IGNORE_RATIO * max_iters:
                total_net_time += net_time_end - net_time_start

            loss = criterion(pred, label)
            acc, acc5 = torch_accuracy(pred, label, (1, 5))

            top1.update(acc.item())
            top5.update(acc5.item())
            losses.update(loss.item())

            pbar_dic = OrderedDict()
            pbar_dic['data-time'] = '{:.2f}'.format(data_time)
            pbar_dic['top1'] = '{:.5f}'.format(top1.mean)
            pbar_dic['top5'] = '{:.5f}'.format(top5.mean)
            pbar_dic['loss'] = '{:.5f}'.format(losses.mean)
            pbar.set_postfix(pbar_dic)

    metric_dic = {'top1': torch.tensor(top1.mean),
                  'top5': torch.tensor(top5.mean),
                  'loss': torch.tensor(losses.mean)}
    reduced_metric_dic = reduce_loss_dict(metric_dic)
    return reduced_metric_dic, total_net_time
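
# reduce_loss_dict above is not defined in this file. A common pattern (an
# assumption; the original may differ) is to average each scalar tensor across
# distributed workers so that every rank logs the same validation numbers.
import torch.distributed as dist


def reduce_loss_dict(loss_dict):
    world_size = dist.get_world_size() if dist.is_available() and dist.is_initialized() else 1
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        names = sorted(loss_dict.keys())
        values = torch.stack([loss_dict[k].cuda() for k in names])
        dist.all_reduce(values)      # sum across ranks ...
        values /= world_size         # ... then divide to get the mean
        return {k: v for k, v in zip(names, values)}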
def csgd_train_main(local_rank, cfg: BaseConfigByEpoch, target_deps, succeeding_strategy,
                    pacesetter_dict, centri_strength, pruned_weights, net=None,
                    train_dataloader=None, val_dataloader=None, show_variables=False,
                    convbuilder=None, init_hdf5=None, no_l2_keywords='depth', use_nesterov=False,
                    load_weights_keyword=None, keyword_to_lr_mult=None, auto_continue=False,
                    save_hdf5_epochs=10000):
    ensure_dir(cfg.output_dir)
    ensure_dir(cfg.tb_dir)
    clusters_save_path = os.path.join(cfg.output_dir, 'clusters.npy')

    with Engine(local_rank=local_rank) as engine:
        engine.setup_log(name='train', log_dir=cfg.output_dir, file_name='log.txt')

        # ----------------------------- build model ------------------------------
        if convbuilder is None:
            convbuilder = ConvBuilder(base_config=cfg)
        if net is None:
            net_fn = get_model_fn(cfg.dataset_name, cfg.network_type)
            model = net_fn(cfg, convbuilder)
        else:
            model = net
        model = model.cuda()
        # ----------------------------- model done ------------------------------

        # ---------------------------- prepare data ------------------------------
        if train_dataloader is None:
            train_data = create_dataset(cfg.dataset_name, cfg.dataset_subset,
                                        cfg.global_batch_size, distributed=engine.distributed)
        if cfg.val_epoch_period > 0 and val_dataloader is None:
            val_data = create_dataset(cfg.dataset_name, 'val', global_batch_size=100,
                                      distributed=False)
        engine.echo('NOTE: Data prepared')
        engine.echo('NOTE: We have global_batch_size={} on {} GPUs, the allocated GPU memory is {}'
                    .format(cfg.global_batch_size, torch.cuda.device_count(),
                            torch.cuda.memory_allocated()))
        # ----------------------------- data done --------------------------------

        # ------------------ prepare optimizer, scheduler, criterion --------------
        if no_l2_keywords is None:
            no_l2_keywords = []
        if type(no_l2_keywords) is not list:
            no_l2_keywords = [no_l2_keywords]
        # For a target parameter, cancel its weight decay in the optimizer,
        # because the weight decay will later be encoded in the decay matrix.
        conv_idx = 0
        for k, v in model.named_parameters():
            if v.dim() != 4:
                continue
            print('prune layer {} from {} to {}'.format(conv_idx, cfg.deps[conv_idx],
                                                        target_deps[conv_idx]))
            if target_deps[conv_idx] < cfg.deps[conv_idx]:
                no_l2_keywords.append(k.replace(KERNEL_KEYWORD, 'conv'))
                no_l2_keywords.append(k.replace(KERNEL_KEYWORD, 'bn'))
            conv_idx += 1
        print('no l2: ', no_l2_keywords)
        optimizer = get_optimizer(engine, cfg, model, no_l2_keywords=no_l2_keywords,
                                  use_nesterov=use_nesterov, keyword_to_lr_mult=keyword_to_lr_mult)
        scheduler = get_lr_scheduler(cfg, optimizer)
        criterion = get_criterion(cfg).cuda()
        # --------------------------------- done ----------------------------------

        engine.register_state(scheduler=scheduler, model=model, optimizer=optimizer)

        if engine.distributed:
            torch.cuda.set_device(local_rank)
            engine.echo('Distributed training, device {}'.format(local_rank))
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank], broadcast_buffers=False)
        else:
            assert torch.cuda.device_count() == 1
            engine.echo('Single GPU training')

        if cfg.init_weights:
            engine.load_checkpoint(cfg.init_weights)
        if init_hdf5:
            engine.load_hdf5(init_hdf5, load_weights_keyword=load_weights_keyword)
        if auto_continue:
            assert cfg.init_weights is None
            engine.load_checkpoint(get_last_checkpoint(cfg.output_dir))
        if show_variables:
            engine.show_variables()

        # ============= prepare the clusters and matrices for C-SGD ===============
        kernel_namedvalue_list = engine.get_all_conv_kernel_namedvalue_as_list()

        if os.path.exists(clusters_save_path):
            layer_idx_to_clusters = np.load(clusters_save_path, allow_pickle=True).item()
        else:
            if local_rank == 0:
                layer_idx_to_clusters = get_layer_idx_to_clusters(
                    kernel_namedvalue_list=kernel_namedvalue_list,
                    target_deps=target_deps,
                    pacesetter_dict=pacesetter_dict)
                if pacesetter_dict is not None:
                    for follower_idx, pacesetter_idx in pacesetter_dict.items():
                        if pacesetter_idx in layer_idx_to_clusters:
                            layer_idx_to_clusters[follower_idx] = \
                                layer_idx_to_clusters[pacesetter_idx]
                np.save(clusters_save_path, layer_idx_to_clusters)
            else:
                while not os.path.exists(clusters_save_path):
                    time.sleep(10)
                    print('sleep, waiting for process 0 to calculate clusters')
                layer_idx_to_clusters = np.load(clusters_save_path, allow_pickle=True).item()

        param_name_to_merge_matrix = generate_merge_matrix_for_kernel(
            deps=cfg.deps,
            layer_idx_to_clusters=layer_idx_to_clusters,
            kernel_namedvalue_list=kernel_namedvalue_list)
        add_vecs_to_merge_mat_dicts(param_name_to_merge_matrix)
        param_name_to_decay_matrix = generate_decay_matrix_for_kernel_and_vecs(
            deps=cfg.deps,
            layer_idx_to_clusters=layer_idx_to_clusters,
            kernel_namedvalue_list=kernel_namedvalue_list,
            weight_decay=cfg.weight_decay,
            weight_decay_bias=cfg.weight_decay_bias,
            centri_strength=centri_strength)
        print(param_name_to_decay_matrix.keys())
        print(param_name_to_merge_matrix.keys())

        conv_idx = 0
        param_to_clusters = {}
        for k, v in model.named_parameters():
            if v.dim() != 4:
                continue
            if conv_idx in layer_idx_to_clusters:
                for clsts in layer_idx_to_clusters[conv_idx]:
                    if len(clsts) > 1:
                        param_to_clusters[v] = layer_idx_to_clusters[conv_idx]
                        break
            conv_idx += 1
        # ==========================================================================

        # ------------ do training ----------------------------
        engine.log("\n\nStart training with pytorch version {}".format(torch.__version__))

        iteration = engine.state.iteration
        iters_per_epoch = num_iters_per_epoch(cfg)
        max_iters = iters_per_epoch * cfg.max_epochs
        tb_writer = SummaryWriter(cfg.tb_dir)
        tb_tags = ['Top1-Acc', 'Top5-Acc', 'Loss']

        model.train()

        done_epochs = iteration // iters_per_epoch
        last_epoch_done_iters = iteration % iters_per_epoch

        if done_epochs == 0 and last_epoch_done_iters == 0:
            engine.save_hdf5(os.path.join(cfg.output_dir, 'init.hdf5'))

        recorded_train_time = 0
        recorded_train_examples = 0
        collected_train_loss_sum = 0
        collected_train_loss_count = 0

        for epoch in range(done_epochs, cfg.max_epochs):
            if engine.distributed and hasattr(train_data, 'train_sampler'):
                train_data.train_sampler.set_epoch(epoch)

            if epoch == done_epochs:
                pbar = tqdm(range(iters_per_epoch - last_epoch_done_iters))
            else:
                pbar = tqdm(range(iters_per_epoch))

            if epoch == 0 and local_rank == 0:
                val_during_train(epoch=epoch, iteration=iteration, tb_tags=tb_tags,
                                 engine=engine, model=model, val_data=val_data,
                                 criterion=criterion, descrip_str='Init',
                                 dataset_name=cfg.dataset_name,
                                 test_batch_size=TEST_BATCH_SIZE, tb_writer=tb_writer)

            top1 = AvgMeter()
            top5 = AvgMeter()
            losses = AvgMeter()
            discrip_str = 'Epoch-{}/{}'.format(epoch, cfg.max_epochs)
            pbar.set_description('Train' + discrip_str)

            for _ in pbar:
                start_time = time.time()
                data, label = load_cuda_data(train_data, dataset_name=cfg.dataset_name)
                data_time = time.time() - start_time

                train_net_time_start = time.time()
                acc, acc5, loss = train_one_step(
                    model, data, label, optimizer, criterion,
                    param_name_to_merge_matrix=param_name_to_merge_matrix,
                    param_name_to_decay_matrix=param_name_to_decay_matrix)
                train_net_time_end = time.time()

                if iteration > TRAIN_SPEED_START * max_iters and iteration < TRAIN_SPEED_END * max_iters:
                    recorded_train_examples += cfg.global_batch_size
                    recorded_train_time += train_net_time_end - train_net_time_start

                scheduler.step()

                for module in model.modules():
                    if hasattr(module, 'set_cur_iter'):
                        module.set_cur_iter(iteration)

                if iteration % cfg.tb_iter_period == 0 and engine.world_rank == 0:
                    for tag, value in zip(tb_tags, [acc.item(), acc5.item(), loss.item()]):
                        tb_writer.add_scalars(tag, {'Train': value}, iteration)
                    # Track how far each cluster's kernels are from their mean;
                    # this should decay as the centripetal constraint pulls them together.
                    deviation_sum = 0
                    for param, clusters in param_to_clusters.items():
                        pvalue = param.detach().cpu().numpy()
                        for cl in clusters:
                            if len(cl) == 1:
                                continue
                            selected = pvalue[cl, :, :, :]
                            mean_kernel = np.mean(selected, axis=0, keepdims=True)
                            diff = selected - mean_kernel
                            deviation_sum += np.sum(diff ** 2)
                    tb_writer.add_scalars('deviation_sum', {'Train': deviation_sum}, iteration)

                top1.update(acc.item())
                top5.update(acc5.item())
                losses.update(loss.item())

                if epoch >= cfg.max_epochs - COLLECT_TRAIN_LOSS_EPOCHS:
                    collected_train_loss_sum += loss.item()
                    collected_train_loss_count += 1

                pbar_dic = OrderedDict()
                pbar_dic['data-time'] = '{:.2f}'.format(data_time)
                pbar_dic['cur_iter'] = iteration
                pbar_dic['lr'] = scheduler.get_lr()[0]
                pbar_dic['top1'] = '{:.5f}'.format(top1.mean)
                pbar_dic['top5'] = '{:.5f}'.format(top5.mean)
                pbar_dic['loss'] = '{:.5f}'.format(losses.mean)
                pbar.set_postfix(pbar_dic)

                iteration += 1

                if iteration >= max_iters or iteration % cfg.ckpt_iter_period == 0:
                    engine.update_iteration(iteration)
                    if (not engine.distributed) or (engine.distributed and engine.world_rank == 0):
                        engine.save_and_link_checkpoint(cfg.output_dir)

                if iteration >= max_iters:
                    break

            # do something after an epoch?
            engine.update_iteration(iteration)
            engine.save_latest_ckpt(cfg.output_dir)

            if (epoch + 1) % save_hdf5_epochs == 0:
                engine.save_hdf5(os.path.join(cfg.output_dir, 'epoch-{}.hdf5'.format(epoch)))

            if local_rank == 0 and cfg.val_epoch_period > 0 and \
                    (epoch >= cfg.max_epochs - 10 or epoch % cfg.val_epoch_period == 0):
                val_during_train(epoch=epoch, iteration=iteration, tb_tags=tb_tags,
                                 engine=engine, model=model, val_data=val_data,
                                 criterion=criterion, descrip_str=discrip_str,
                                 dataset_name=cfg.dataset_name,
                                 test_batch_size=TEST_BATCH_SIZE, tb_writer=tb_writer)

            if iteration >= max_iters:
                break

        # do something after the training
        if recorded_train_time > 0:
            exp_per_sec = recorded_train_examples / recorded_train_time
        else:
            exp_per_sec = 0
        engine.log(
            'TRAIN speed: from {} to {} iterations, batch_size={}, examples={}, '
            'total_net_time={:.4f}, examples/sec={}'
            .format(int(TRAIN_SPEED_START * max_iters), int(TRAIN_SPEED_END * max_iters),
                    cfg.global_batch_size, recorded_train_examples, recorded_train_time,
                    exp_per_sec))
        if cfg.save_weights:
            engine.save_checkpoint(cfg.save_weights)
            print('NOTE: training finished, saved to {}'.format(cfg.save_weights))
        engine.save_hdf5(os.path.join(cfg.output_dir, 'finish.hdf5'))
        if collected_train_loss_count > 0:
            engine.log('TRAIN LOSS collected over last {} epochs: {:.6f}'.format(
                COLLECT_TRAIN_LOSS_EPOCHS,
                collected_train_loss_sum / collected_train_loss_count))

        if local_rank == 0:
            csgd_prune_and_save(engine=engine,
                                layer_idx_to_clusters=layer_idx_to_clusters,
                                save_file=pruned_weights,
                                succeeding_strategy=succeeding_strategy,
                                new_deps=target_deps)
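
# Illustrative sketch (an assumed form, not the original
# generate_merge_matrix_for_kernel): C-SGD averages the gradients of all filters
# in a cluster via a merge matrix M with M[i, j] = 1/|cluster| for i, j in the
# same cluster, so clustered filters converge to identical values and all but
# one per cluster can be pruned afterwards.
def merge_matrix_from_clusters(num_filters, clusters):
    m = np.zeros((num_filters, num_filters), dtype=np.float32)
    for cluster in clusters:
        for i in cluster:
            for j in cluster:
                m[i, j] = 1.0 / len(cluster)
    return torch.from_numpy(m)

# e.g. merge_matrix_from_clusters(4, [[0, 2], [1], [3]]) makes filters 0 and 2
# share their averaged gradient while filters 1 and 3 train normally.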
def aofp_train_main(local_rank, target_layers, succ_strategy, warmup_iterations,
                    aofp_batches_per_half, flops_func, cfg: BaseConfigByEpoch,
                    net=None, train_dataloader=None, val_dataloader=None, show_variables=False,
                    convbuilder=None, init_hdf5=None, no_l2_keywords='depth', gradient_mask=None,
                    use_nesterov=False, tensorflow_style_init=False, keyword_to_lr_mult=None,
                    auto_continue=False, lasso_keyword_to_strength=None, save_hdf5_epochs=10000,
                    remain_flops_ratio=0):
    if no_l2_keywords is None:
        no_l2_keywords = []
    if type(no_l2_keywords) is not list:
        no_l2_keywords = [no_l2_keywords]

    ensure_dir(cfg.output_dir)
    ensure_dir(cfg.tb_dir)
    with Engine(local_rank=local_rank) as engine:
        engine.setup_log(name='train', log_dir=cfg.output_dir, file_name='log.txt')

        # ----------------------------- build model ------------------------------
        if convbuilder is None:
            convbuilder = ConvBuilder(base_config=cfg)
        if net is None:
            net_fn = get_model_fn(cfg.dataset_name, cfg.network_type)
            model = net_fn(cfg, convbuilder)
        else:
            model = net
        model = model.cuda()
        # ----------------------------- model done ------------------------------

        # ---------------------------- prepare data ------------------------------
        if train_dataloader is None:
            train_data = create_dataset(cfg.dataset_name, cfg.dataset_subset,
                                        cfg.global_batch_size, distributed=engine.distributed)
        if cfg.val_epoch_period > 0 and val_dataloader is None:
            val_data = create_dataset(cfg.dataset_name, 'val', global_batch_size=100,
                                      distributed=False)
        engine.echo('NOTE: Data prepared')
        engine.echo('NOTE: We have global_batch_size={} on {} GPUs, the allocated GPU memory is {}'
                    .format(cfg.global_batch_size, torch.cuda.device_count(),
                            torch.cuda.memory_allocated()))
        # ----------------------------- data done --------------------------------

        # ------------------ prepare optimizer, scheduler, criterion --------------
        optimizer = get_optimizer(engine, cfg, model, no_l2_keywords=no_l2_keywords,
                                  use_nesterov=use_nesterov, keyword_to_lr_mult=keyword_to_lr_mult)
        scheduler = get_lr_scheduler(cfg, optimizer)
        criterion = get_criterion(cfg).cuda()
        # --------------------------------- done ----------------------------------

        engine.register_state(scheduler=scheduler, model=model, optimizer=optimizer)

        if engine.distributed:
            torch.cuda.set_device(local_rank)
            engine.echo('Distributed training, device {}'.format(local_rank))
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank], broadcast_buffers=False)
        else:
            assert torch.cuda.device_count() == 1
            engine.echo('Single GPU training')

        if tensorflow_style_init:
            init_as_tensorflow(model)
        if cfg.init_weights:
            engine.load_checkpoint(cfg.init_weights)
        if init_hdf5:
            engine.load_part('base_path.', init_hdf5)
        if auto_continue:
            assert cfg.init_weights is None
            engine.load_checkpoint(get_last_checkpoint(cfg.output_dir))
        if show_variables:
            engine.show_variables()

        # ------------ do training ----------------------------
        engine.log("\n\nStart training with pytorch version {}".format(torch.__version__))

        iteration = engine.state.iteration
        iters_per_epoch = num_iters_per_epoch(cfg)
        max_iters = iters_per_epoch * cfg.max_epochs
        tb_writer = SummaryWriter(cfg.tb_dir)
        tb_tags = ['Top1-Acc', 'Top5-Acc', 'Loss']

        model.train()

        done_epochs = iteration // iters_per_epoch
        last_epoch_done_iters = iteration % iters_per_epoch

        if done_epochs == 0 and last_epoch_done_iters == 0:
            engine.save_hdf5(os.path.join(cfg.output_dir, 'init.hdf5'))

        recorded_train_time = 0
        recorded_train_examples = 0
        collected_train_loss_sum = 0
        collected_train_loss_count = 0

        if gradient_mask is not None:
            gradient_mask_tensor = {}
            for name, value in gradient_mask.items():
                gradient_mask_tensor[name] = torch.Tensor(value).cuda()
        else:
            gradient_mask_tensor = None

        ######################### aofp
        _init_interval = aofp_batches_per_half // len(target_layers)
        layer_to_start_iter = {i: (_init_interval * i + warmup_iterations)
                               for i in target_layers}
        print('the initial layer_to_start_iter = {}'.format(layer_to_start_iter))
        # 0. get all the AOFP layers
        layer_idx_to_module = {}
        for submodule in model.modules():
            if hasattr(submodule, 'score_mask') or hasattr(submodule, 't_value'):
                layer_idx_to_module[submodule.conv_idx] = submodule
        print(layer_idx_to_module)
        ######################################

        for epoch in range(done_epochs, cfg.max_epochs):
            if engine.distributed and hasattr(train_data, 'train_sampler'):
                train_data.train_sampler.set_epoch(epoch)

            if epoch == done_epochs:
                pbar = tqdm(range(iters_per_epoch - last_epoch_done_iters))
            else:
                pbar = tqdm(range(iters_per_epoch))

            if epoch == 0 and local_rank == 0:
                val_during_train(epoch=epoch, iteration=iteration, tb_tags=tb_tags,
                                 engine=engine, model=model, val_data=val_data,
                                 criterion=criterion, descrip_str='Init',
                                 dataset_name=cfg.dataset_name,
                                 test_batch_size=TEST_BATCH_SIZE, tb_writer=tb_writer)

            top1 = AvgMeter()
            top5 = AvgMeter()
            losses = AvgMeter()
            discrip_str = 'Epoch-{}/{}'.format(epoch, cfg.max_epochs)
            pbar.set_description('Train' + discrip_str)

            for _ in pbar:
                start_time = time.time()
                data, label = load_cuda_data(train_data, dataset_name=cfg.dataset_name)
                data_time = time.time() - start_time

                if_accum_grad = ((iteration % cfg.grad_accum_iters) != 0)

                train_net_time_start = time.time()

                ############ aofp
                # 1. see if it is time to start on every layer
                # 2. forward and accumulate
                # 3. if a half on some layer is finished, do something
                #    ---- fetch its accumulated t vectors, analyze the first 'granu' elements
                #    ---- if good enough, set the base mask, reset the search space
                #    ---- elif granu == 1, do nothing
                #    ---- else, granu /= 2, reset the search space
                for layer_idx, start_iter in layer_to_start_iter.items():
                    if start_iter == iteration:
                        layer_idx_to_module[layer_idx].start_aofp(iteration)

                acc, acc5, loss = train_one_step(model, data, label, optimizer, criterion,
                                                 if_accum_grad,
                                                 gradient_mask_tensor=gradient_mask_tensor,
                                                 lasso_keyword_to_strength=lasso_keyword_to_strength)

                for layer_idx, aofp_layer in layer_idx_to_module.items():
                    # accumulate
                    if layer_idx not in succ_strategy:
                        continue
                    follow_layer_idx = succ_strategy[layer_idx]
                    if follow_layer_idx not in layer_idx_to_module:
                        continue
                    t_value = layer_idx_to_module[follow_layer_idx].t_value
                    aofp_layer.accumulate_t_value(t_value)
                    if aofp_layer.finished_a_half(iteration):
                        aofp_layer.halve_or_stop(iteration)
                ###################################

                train_net_time_end = time.time()

                if iteration > TRAIN_SPEED_START * max_iters and iteration < TRAIN_SPEED_END * max_iters:
                    recorded_train_examples += cfg.global_batch_size
                    recorded_train_time += train_net_time_end - train_net_time_start

                scheduler.step()

                for module in model.modules():
                    if hasattr(module, 'set_cur_iter'):
                        module.set_cur_iter(iteration)

                if iteration % cfg.tb_iter_period == 0 and engine.world_rank == 0:
                    for tag, value in zip(tb_tags, [acc.item(), acc5.item(), loss.item()]):
                        tb_writer.add_scalars(tag, {'Train': value}, iteration)

                top1.update(acc.item())
                top5.update(acc5.item())
                losses.update(loss.item())

                if epoch >= cfg.max_epochs - COLLECT_TRAIN_LOSS_EPOCHS:
                    collected_train_loss_sum += loss.item()
                    collected_train_loss_count += 1

                pbar_dic = OrderedDict()
                pbar_dic['data-time'] = '{:.2f}'.format(data_time)
                pbar_dic['cur_iter'] = iteration
                pbar_dic['lr'] = scheduler.get_lr()[0]
                pbar_dic['top1'] = '{:.5f}'.format(top1.mean)
                pbar_dic['top5'] = '{:.5f}'.format(top5.mean)
                pbar_dic['loss'] = '{:.5f}'.format(losses.mean)
                pbar.set_postfix(pbar_dic)

                iteration += 1

                if iteration >= max_iters or iteration % cfg.ckpt_iter_period == 0:
                    engine.update_iteration(iteration)
                    if (not engine.distributed) or (engine.distributed and engine.world_rank == 0):
                        engine.save_and_link_checkpoint(cfg.output_dir)

                if iteration >= max_iters:
                    break

            # do something after an epoch?
            engine.update_iteration(iteration)
            engine.save_latest_ckpt(cfg.output_dir)

            if (epoch + 1) % save_hdf5_epochs == 0:
                engine.save_hdf5(os.path.join(cfg.output_dir, 'epoch-{}.hdf5'.format(epoch)))

            if local_rank == 0 and cfg.val_epoch_period > 0 and \
                    (epoch >= cfg.max_epochs - 10 or epoch % cfg.val_epoch_period == 0):
                val_during_train(epoch=epoch, iteration=iteration, tb_tags=tb_tags,
                                 engine=engine, model=model, val_data=val_data,
                                 criterion=criterion, descrip_str=discrip_str,
                                 dataset_name=cfg.dataset_name,
                                 test_batch_size=TEST_BATCH_SIZE, tb_writer=tb_writer)

            # Check the pruning progress: derive the current per-layer widths from
            # the base masks and compare the FLOPs against the target ratio.
            cur_deps = np.array(cfg.deps)
            for submodule in model.modules():
                if hasattr(submodule, 'base_mask'):
                    cur_deps[submodule.conv_idx] = np.sum(submodule.base_mask.cpu().numpy() == 1)
            origin_flops = flops_func(cfg.deps)
            cur_flops = flops_func(cur_deps)
            remain_ratio = cur_flops / origin_flops
            if local_rank == 0:
                print('##########################')
                print('origin deps ', cfg.deps)
                print('cur deps ', cur_deps)
                print('remain flops ratio = ', remain_ratio, 'the target is ', remain_flops_ratio)
                print('##########################')
            if remain_ratio < remain_flops_ratio:
                break
            if iteration >= max_iters:
                break

        # do something after the training
        if recorded_train_time > 0:
            exp_per_sec = recorded_train_examples / recorded_train_time
        else:
            exp_per_sec = 0
        engine.log(
            'TRAIN speed: from {} to {} iterations, batch_size={}, examples={}, '
            'total_net_time={:.4f}, examples/sec={}'
            .format(int(TRAIN_SPEED_START * max_iters), int(TRAIN_SPEED_END * max_iters),
                    cfg.global_batch_size, recorded_train_examples, recorded_train_time,
                    exp_per_sec))
        if cfg.save_weights:
            engine.save_checkpoint(cfg.save_weights)
            print('NOTE: training finished, saved to {}'.format(cfg.save_weights))
        engine.save_hdf5(os.path.join(cfg.output_dir, 'finish.hdf5'))
        if collected_train_loss_count > 0:
            engine.log('TRAIN LOSS collected over last {} epochs: {:.6f}'.format(
                COLLECT_TRAIN_LOSS_EPOCHS,
                collected_train_loss_sum / collected_train_loss_count))

        final_deps = aofp_prune(model, origin_deps=cfg.deps, succ_strategy=succ_strategy,
                                save_path=os.path.join(cfg.output_dir, 'finish_pruned.hdf5'))
        origin_flops = flops_func(cfg.deps)
        cur_flops = flops_func(final_deps)
        engine.log('##################################################################')
        engine.log(cfg.network_type)
        engine.log('origin width: {} , flops {} '.format(cfg.deps, origin_flops))
        engine.log('final width: {}, flops {} '.format(final_deps, cur_flops))
        engine.log('flops reduction: {}'.format(1 - cur_flops / origin_flops))
        return final_deps
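
# flops_func above maps a per-layer width vector ("deps") to a FLOPs count. The
# real function is architecture-specific; this toy stand-in for a plain conv
# stack (assumed 3x3 kernels and a fixed feature-map size) only shows the idea.
def toy_conv_flops(deps, in_channels=3, spatial=32 * 32, k=3):
    flops, c_in = 0, in_channels
    for c_out in deps:
        flops += c_in * c_out * k * k * spatial  # multiply-accumulates of one conv
        c_in = c_out
    return flops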
def ding_train(cfg: BaseConfigByEpoch, net=None, train_dataloader=None, val_dataloader=None,
               show_variables=False, convbuilder=None, beginning_msg=None, init_hdf5=None,
               no_l2_keywords=None, gradient_mask=None, use_nesterov=False):
    ensure_dir(cfg.output_dir)
    ensure_dir(cfg.tb_dir)
    with Engine() as engine:
        is_main_process = (engine.world_rank == 0)  # TODO correct?
        logger = engine.setup_log(name='train', log_dir=cfg.output_dir, file_name='log.txt')

        # -- typical model components: model, opt, scheduler, dataloader --
        if net is None:
            net = get_model_fn(cfg.dataset_name, cfg.network_type)
        if convbuilder is None:
            convbuilder = ConvBuilder(base_config=cfg)
        model = net(cfg, convbuilder).cuda()

        if train_dataloader is None:
            train_dataloader = create_dataset(cfg.dataset_name, cfg.dataset_subset,
                                              cfg.global_batch_size)
        if cfg.val_epoch_period > 0 and val_dataloader is None:
            val_dataloader = create_dataset(cfg.dataset_name, 'val', batch_size=100)  # TODO 100?
        print('NOTE: Data prepared')
        print('NOTE: We have global_batch_size={} on {} GPUs, the allocated GPU memory is {}'
              .format(cfg.global_batch_size, torch.cuda.device_count(),
                      torch.cuda.memory_allocated()))

        if no_l2_keywords is None:
            no_l2_keywords = []
        optimizer = get_optimizer(cfg, model, no_l2_keywords=no_l2_keywords,
                                  use_nesterov=use_nesterov)
        scheduler = get_lr_scheduler(cfg, optimizer)
        criterion = get_criterion(cfg).cuda()

        engine.register_state(scheduler=scheduler, model=model, optimizer=optimizer)

        if engine.distributed:
            print('Distributed training, engine.world_rank={}'.format(engine.world_rank))
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[engine.world_rank], broadcast_buffers=False)
        elif torch.cuda.device_count() > 1:
            print('Single machine multiple GPU training')
            model = torch.nn.parallel.DataParallel(model)

        if cfg.init_weights:
            engine.load_checkpoint(cfg.init_weights, is_restore=True)
        if init_hdf5:
            engine.load_hdf5(init_hdf5)

        if show_variables:
            engine.show_variables()

        # ------------ do training ----------------------------
        if beginning_msg:
            engine.log(beginning_msg)
        logger.info("\n\nStart training with pytorch version {}".format(torch.__version__))

        iteration = engine.state.iteration
        iters_per_epoch = num_iters_per_epoch(cfg)
        max_iters = iters_per_epoch * cfg.max_epochs
        tb_writer = SummaryWriter(cfg.tb_dir)
        tb_tags = ['Top1-Acc', 'Top5-Acc', 'Loss']

        model.train()
        done_epochs = iteration // iters_per_epoch
        engine.save_hdf5(os.path.join(cfg.output_dir, 'init.hdf5'))

        recorded_train_time = 0
        recorded_train_examples = 0

        if gradient_mask is not None:
            gradient_mask_tensor = {}
            for name, value in gradient_mask.items():
                gradient_mask_tensor[name] = torch.Tensor(value).cuda()
        else:
            gradient_mask_tensor = None

        for epoch in range(done_epochs, cfg.max_epochs):
            pbar = tqdm(range(iters_per_epoch))
            top1 = AvgMeter()
            top5 = AvgMeter()
            losses = AvgMeter()
            discrip_str = 'Epoch-{}/{}'.format(epoch, cfg.max_epochs)
            pbar.set_description('Train' + discrip_str)

            if cfg.val_epoch_period > 0 and epoch % cfg.val_epoch_period == 0:
                model.eval()
                # use batch_size=100 for val on ImageNet and CIFAR
                val_iters = 500 if cfg.dataset_name == 'imagenet' else 100
                eval_dict, _ = run_eval(val_dataloader, val_iters, model, criterion,
                                        discrip_str, dataset_name=cfg.dataset_name)
                val_top1_value = eval_dict['top1'].item()
                val_top5_value = eval_dict['top5'].item()
                val_loss_value = eval_dict['loss'].item()
                for tag, value in zip(tb_tags, [val_top1_value, val_top5_value, val_loss_value]):
                    tb_writer.add_scalars(tag, {'Val': value}, iteration)
                engine.log('validate at epoch {}, top1={:.5f}, top5={:.5f}, loss={:.6f}'
                           .format(epoch, val_top1_value, val_top5_value, val_loss_value))
                model.train()

            for _ in pbar:
                start_time = time.time()
                data, label = load_cuda_data(train_dataloader, cfg.dataset_name)
                data_time = time.time() - start_time
                if_accum_grad = ((iteration % cfg.grad_accum_iters) != 0)

                train_net_time_start = time.time()
                acc, acc5, loss = train_one_step(model, data, label, optimizer, criterion,
                                                 if_accum_grad,
                                                 gradient_mask_tensor=gradient_mask_tensor)
                train_net_time_end = time.time()

                if iteration > TRAIN_SPEED_START * max_iters and iteration < TRAIN_SPEED_END * max_iters:
                    recorded_train_examples += cfg.global_batch_size
                    recorded_train_time += train_net_time_end - train_net_time_start

                scheduler.step()

                if iteration % cfg.tb_iter_period == 0 and is_main_process:
                    for tag, value in zip(tb_tags, [acc.item(), acc5.item(), loss.item()]):
                        tb_writer.add_scalars(tag, {'Train': value}, iteration)

                top1.update(acc.item())
                top5.update(acc5.item())
                losses.update(loss.item())

                pbar_dic = OrderedDict()
                pbar_dic['data-time'] = '{:.2f}'.format(data_time)
                pbar_dic['cur_iter'] = iteration
                pbar_dic['lr'] = scheduler.get_lr()[0]
                pbar_dic['top1'] = '{:.5f}'.format(top1.mean)
                pbar_dic['top5'] = '{:.5f}'.format(top5.mean)
                pbar_dic['loss'] = '{:.5f}'.format(losses.mean)
                pbar.set_postfix(pbar_dic)

                if iteration >= max_iters or iteration % cfg.ckpt_iter_period == 0:
                    engine.update_iteration(iteration)
                    if (not engine.distributed) or (engine.distributed and is_main_process):
                        engine.save_and_link_checkpoint(cfg.output_dir)

                iteration += 1
                if iteration >= max_iters:
                    break

            # do something after an epoch?
            if iteration >= max_iters:
                break

        # do something after the training
        if recorded_train_time > 0:
            exp_per_sec = recorded_train_examples / recorded_train_time
        else:
            exp_per_sec = 0
        engine.log(
            'TRAIN speed: from {} to {} iterations, batch_size={}, examples={}, '
            'total_net_time={:.4f}, examples/sec={}'
            .format(int(TRAIN_SPEED_START * max_iters), int(TRAIN_SPEED_END * max_iters),
                    cfg.global_batch_size, recorded_train_examples, recorded_train_time,
                    exp_per_sec))
        if cfg.save_weights:
            engine.save_checkpoint(cfg.save_weights)
            print('NOTE: training finished, saved to {}'.format(cfg.save_weights))
        engine.save_hdf5(os.path.join(cfg.output_dir, 'finish.hdf5'))
def _test_process(self, save_pre):
    loader = self.te_loader
    pres = list()
    recs = list()
    meanfs = AvgMeter()
    maes = AvgMeter()

    # Measures from the Saliency Toolbox
    measures = ['Wgt-F', 'E-measure', 'S-measure', 'Mod-Max-F', 'Mod-Adp-F', 'Mod-Wgt-F']
    beta = np.sqrt(0.3)  # default beta parameter used in the adaptive F-measure
    gt_threshold = 0.5  # the threshold used to binarize ground-truth maps
    values = dict()  # initialize measure value dictionary
    pr = dict()  # initialize precision-recall dictionary
    prm = dict()  # initialize precision-recall dictionary for Mod-Max-F
    for idx in measures:
        values[idx] = list()
        if idx == 'Max-F':
            pr['Precision'] = list()
            pr['Recall'] = list()
        if idx == 'Mod-Max-F':
            prm['Precision'] = list()
            prm['Recall'] = list()

    tqdm_iter = tqdm(enumerate(loader), total=len(loader), leave=False)
    for test_batch_id, test_data in tqdm_iter:
        tqdm_iter.set_description(f"{self.exp_name}: te=>{test_batch_id + 1}")
        in_imgs, in_mask_paths, in_names = test_data

        generate_out_imgs = False
        if self.arg_dict["resume_mode"] == "measure":
            # Check whether prediction masks have already been created.
            for item_id, in_fname in enumerate(in_names):
                oimg_path = os.path.join(self.save_path, in_fname + ".png")
                if not os.path.exists(oimg_path):
                    # The output image does not exist yet.
                    generate_out_imgs = True
                    break
        else:
            generate_out_imgs = True

        if generate_out_imgs:
            with torch.no_grad():
                in_imgs = in_imgs.to(self.dev, non_blocking=True)
                outputs = self.net(in_imgs)
                outputs_np = outputs.sigmoid().cpu().detach()

        for item_id, in_fname in enumerate(in_names):
            oimg_path = os.path.join(self.save_path, in_fname + ".png")
            gimg_path = os.path.join(in_mask_paths[item_id])
            gt_img = Image.open(gimg_path).convert("L")
            if self.arg_dict["resume_mode"] == "measure" and not generate_out_imgs:
                out_img = Image.open(oimg_path).convert("L")
            else:
                out_item = outputs_np[item_id]
                out_img = self.to_pil(out_item).resize(gt_img.size, resample=Image.NEAREST)
            if save_pre and generate_out_imgs:
                out_img.save(oimg_path)

            gt_img = np.array(gt_img)
            out_img = np.array(out_img)

            # Gather the images again using the Saliency Toolbox's methods.
            # These images will be grayscale floats between 0 and 1.
            sm = out_img.astype(np.float32)
            if sm.max() == sm.min():
                sm = sm / 255
            else:
                sm = (sm - sm.min()) / (sm.max() - sm.min())
            gt = np.zeros_like(gt_img, dtype=np.float32)
            gt[gt_img > 256 * gt_threshold] = 1

            ps, rs, mae, meanf = cal_pr_mae_meanf(out_img, gt_img)
            pres.append(ps)
            recs.append(rs)
            maes.update(mae)
            meanfs.update(meanf)

            # Compute the other measures using the Saliency Toolbox.
            if 'MAE2' in measures:
                values['MAE2'].append(mean_square_error(gt, sm))
            if 'E-measure' in measures:
                values['E-measure'].append(e_measure(gt, sm))
            if 'S-measure' in measures:
                values['S-measure'].append(s_measure(gt, sm))
            if 'Adp-F' in measures:
                values['Adp-F'].append(adaptive_fmeasure(gt, sm, beta, allowBlackMask=False))
            if 'Mod-Adp-F' in measures:
                values['Mod-Adp-F'].append(adaptive_fmeasure(gt, sm, beta, allowBlackMask=True))
            if 'Wgt-F' in measures:
                values['Wgt-F'].append(weighted_fmeasure(gt, sm, allowBlackMask=False))
            if 'Mod-Wgt-F' in measures:
                values['Mod-Wgt-F'].append(weighted_fmeasure(gt, sm, allowBlackMask=True))
            if 'Max-F' in measures:
                # 256 thresholds between 0 and 1
                prec, recall = prec_recall(gt, sm, 256, allowBlackMask=False)
                # Check that the precision-recall curve exists.
                if len(prec) != 0 and len(recall) != 0:
                    pr['Precision'].append(prec)
                    pr['Recall'].append(recall)
            if 'Mod-Max-F' in measures:
                # 256 thresholds between 0 and 1
                prec, recall = prec_recall(gt, sm, 256, allowBlackMask=True)
                # Check that the precision-recall curve exists.
                if len(prec) != 0 and len(recall) != 0:
                    prm['Precision'].append(prec)
                    prm['Recall'].append(recall)

    # Compute the total measures over all images.
    if 'MAE2' in measures:
        values['MAE2'] = np.mean(values['MAE2'])
    if 'E-measure' in measures:
        values['E-measure'] = np.mean(values['E-measure'])
    if 'S-measure' in measures:
        values['S-measure'] = np.mean(values['S-measure'])
    if 'Adp-F' in measures:
        values['Adp-F'] = np.mean(values['Adp-F'])
    if 'Mod-Adp-F' in measures:
        values['Mod-Adp-F'] = np.mean(values['Mod-Adp-F'])
    if 'Wgt-F' in measures:
        values['Wgt-F'] = np.mean(values['Wgt-F'])
    if 'Mod-Wgt-F' in measures:
        values['Mod-Wgt-F'] = np.mean(values['Mod-Wgt-F'])
    if 'Max-F' in measures:
        if len(pr['Precision']) > 0:
            pr['Precision'] = np.mean(np.hstack(pr['Precision'][:]), 1)
            pr['Recall'] = np.mean(np.hstack(pr['Recall'][:]), 1)
            f_measures = (1 + beta ** 2) * pr['Precision'] * pr['Recall'] / \
                         (beta ** 2 * pr['Precision'] + pr['Recall'])
            # Remove any NaN values to allow the calculation.
            f_measures[np.isnan(f_measures)] = 0
            values['Max-F'] = np.max(f_measures)
        else:
            # There were likely no images found in the directory, so
            # pr['Precision'] is an empty set.
            values['Max-F'] = 0
    if 'Mod-Max-F' in measures:
        if len(prm['Precision']) > 0:
            prm['Precision'] = np.mean(np.hstack(prm['Precision'][:]), 1)
            prm['Recall'] = np.mean(np.hstack(prm['Recall'][:]), 1)
            f_measures = (1 + beta ** 2) * prm['Precision'] * prm['Recall'] / \
                         (beta ** 2 * prm['Precision'] + prm['Recall'])
            # Remove any NaN values to allow the calculation.
            f_measures[np.isnan(f_measures)] = 0
            values['Mod-Max-F'] = np.max(f_measures)
        else:
            # There were likely no images found in the directory, so
            # prm['Precision'] is an empty set.
            values['Mod-Max-F'] = 0

    # Calculate MAXF from the original algorithm's precision-recall curves.
    pres = np.mean(np.hstack(pres[:]), 1)
    recs = np.mean(np.hstack(recs[:]), 1)
    f_measures = (1 + beta ** 2) * pres * recs / (beta ** 2 * pres + recs)
    # Remove any NaN values to allow the calculation.
    f_measures[np.isnan(f_measures)] = 0
    maxf = np.max(f_measures)

    results = {"MAXF": maxf, "MEANF": meanfs.avg, "MAE": maes.avg, **values}
    return results
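
# A hedged sketch of cal_pr_mae_meanf consistent with its callers above:
# per-image precision/recall over 256 thresholds (returned as 256x1 columns so
# that np.hstack stacks per-image curves side by side), MAE, and an
# adaptive-threshold mean F-measure with beta^2 = 0.3. The original
# implementation may differ.
def cal_pr_mae_meanf(pred, gt, beta2=0.3):
    pred = pred.astype(np.float64) / 255.0
    gt = (gt.astype(np.float64) / 255.0) > 0.5
    mae = np.abs(pred - gt).mean()
    ps, rs = [], []
    for th in np.linspace(0, 1 - 1e-10, 256):
        bin_pred = pred >= th
        tp = (bin_pred & gt).sum()
        ps.append(tp / max(bin_pred.sum(), 1))
        rs.append(tp / max(gt.sum(), 1))
    # adaptive threshold: twice the mean saliency, clipped to 1
    adp = min(2 * pred.mean(), 1.0)
    bin_pred = pred >= adp
    tp = (bin_pred & gt).sum()
    p, r = tp / max(bin_pred.sum(), 1), tp / max(gt.sum(), 1)
    meanf = (1 + beta2) * p * r / max(beta2 * p + r, 1e-8)
    return np.array(ps).reshape(-1, 1), np.array(rs).reshape(-1, 1), mae, meanf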
def csgd_train_and_prune(cfg: BaseConfigByEpoch,
                         target_deps,
                         centri_strength,
                         pacesetter_dict,
                         succeeding_strategy,
                         pruned_weights,
                         net=None,
                         train_dataloader=None,
                         val_dataloader=None,
                         show_variables=False,
                         convbuilder=None,
                         beginning_msg=None,
                         init_hdf5=None,
                         no_l2_keywords=None,
                         use_nesterov=False,
                         tensorflow_style_init=False):
    ensure_dir(cfg.output_dir)
    ensure_dir(cfg.tb_dir)
    clusters_save_path = os.path.join(cfg.output_dir, 'clusters.npy')

    with Engine() as engine:
        is_main_process = (engine.world_rank == 0)  # TODO correct?

        logger = engine.setup_log(name='train',
                                  log_dir=cfg.output_dir,
                                  file_name='log.txt')

        # -- typical model components: model, opt, scheduler, dataloader --#
        if net is None:
            net = get_model_fn(cfg.dataset_name, cfg.network_type)
        if convbuilder is None:
            convbuilder = ConvBuilder(base_config=cfg)
        model = net(cfg, convbuilder).cuda()

        if train_dataloader is None:
            train_dataloader = create_dataset(cfg.dataset_name,
                                              cfg.dataset_subset,
                                              cfg.global_batch_size)
        if cfg.val_epoch_period > 0 and val_dataloader is None:
            val_dataloader = create_dataset(cfg.dataset_name, 'val',
                                            batch_size=100)  # TODO 100?
        print('NOTE: Data prepared')
        print('NOTE: We have global_batch_size={} on {} GPUs, the allocated GPU memory is {}'
              .format(cfg.global_batch_size, torch.cuda.device_count(),
                      torch.cuda.memory_allocated()))

        optimizer = get_optimizer(cfg, model, use_nesterov=use_nesterov)
        scheduler = get_lr_scheduler(cfg, optimizer)
        criterion = get_criterion(cfg).cuda()

        # model, optimizer = amp.initialize(model, optimizer, opt_level="O0")

        engine.register_state(scheduler=scheduler,
                              model=model,
                              optimizer=optimizer,
                              cfg=cfg)

        if engine.distributed:
            print('Distributed training, engine.world_rank={}'.format(
                engine.world_rank))
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=[engine.world_rank],
                broadcast_buffers=False,
            )
            # model = DistributedDataParallel(model, delay_allreduce=True)
        elif torch.cuda.device_count() > 1:
            print('Single machine multiple GPU training')
            model = torch.nn.parallel.DataParallel(model)

        if tensorflow_style_init:
            for k, v in model.named_parameters():
                if v.dim() in [2, 4]:
                    torch.nn.init.xavier_uniform_(v)
                    print('init {} as xavier_uniform'.format(k))
                if 'bias' in k and 'bn' not in k.lower():
                    torch.nn.init.zeros_(v)
                    print('init {} as zero'.format(k))

        if cfg.init_weights:
            engine.load_checkpoint(cfg.init_weights)
        if init_hdf5:
            engine.load_hdf5(init_hdf5)

        kernel_namedvalue_list = engine.get_all_conv_kernel_namedvalue_as_list()

        if os.path.exists(clusters_save_path):
            # allow_pickle is required to load a saved dict on recent numpy
            layer_idx_to_clusters = np.load(clusters_save_path,
                                            allow_pickle=True).item()
        else:
            layer_idx_to_clusters = get_layer_idx_to_clusters(
                kernel_namedvalue_list=kernel_namedvalue_list,
                target_deps=target_deps,
                pacesetter_dict=pacesetter_dict)
            if pacesetter_dict is not None:
                for follower_idx, pacesetter_idx in pacesetter_dict.items():
                    if pacesetter_idx in layer_idx_to_clusters:
                        layer_idx_to_clusters[follower_idx] = \
                            layer_idx_to_clusters[pacesetter_idx]
            np.save(clusters_save_path, layer_idx_to_clusters)

        csgd_save_file = os.path.join(cfg.output_dir, 'finish.hdf5')

        if os.path.exists(csgd_save_file):
            engine.load_hdf5(csgd_save_file)
        else:
            param_name_to_merge_matrix = generate_merge_matrix_for_kernel(
                deps=cfg.deps,
                layer_idx_to_clusters=layer_idx_to_clusters,
                kernel_namedvalue_list=kernel_namedvalue_list)
            param_name_to_decay_matrix = generate_decay_matrix_for_kernel_and_vecs(
                deps=cfg.deps,
                layer_idx_to_clusters=layer_idx_to_clusters,
                kernel_namedvalue_list=kernel_namedvalue_list,
                weight_decay=cfg.weight_decay,
                centri_strength=centri_strength)
            # if pacesetter_dict is not None:
            #     for follower_idx, pacesetter_idx in pacesetter_dict.items():
            #         follower_kernel_name = kernel_namedvalue_list[follower_idx].name
            #         pacesetter_kernel_name = kernel_namedvalue_list[follower_idx].name
            #         if pacesetter_kernel_name in param_name_to_merge_matrix:
            #             param_name_to_merge_matrix[follower_kernel_name] = param_name_to_merge_matrix[
            #                 pacesetter_kernel_name]
            #             param_name_to_decay_matrix[follower_kernel_name] = param_name_to_decay_matrix[
            #                 pacesetter_kernel_name]
            add_vecs_to_mat_dicts(param_name_to_merge_matrix)

            if show_variables:
                engine.show_variables()
            if beginning_msg:
                engine.log(beginning_msg)
            logger.info("\n\nStart training with pytorch version {}".format(
                torch.__version__))

            iteration = engine.state.iteration
            # done_epochs = iteration // num_train_examples_per_epoch(cfg.dataset_name)
            iters_per_epoch = num_iters_per_epoch(cfg)
            max_iters = iters_per_epoch * cfg.max_epochs
            tb_writer = SummaryWriter(cfg.tb_dir)
            tb_tags = ['Top1-Acc', 'Top5-Acc', 'Loss']

            model.train()
            done_epochs = iteration // iters_per_epoch
            engine.save_hdf5(os.path.join(cfg.output_dir, 'init.hdf5'))

            recorded_train_time = 0
            recorded_train_examples = 0

            for epoch in range(done_epochs, cfg.max_epochs):
                pbar = tqdm(range(iters_per_epoch))
                top1 = AvgMeter()
                top5 = AvgMeter()
                losses = AvgMeter()
                discrip_str = 'Epoch-{}/{}'.format(epoch, cfg.max_epochs)
                pbar.set_description('Train' + discrip_str)

                if cfg.val_epoch_period > 0 and epoch % cfg.val_epoch_period == 0:
                    model.eval()
                    # use batch_size=100 for val on ImageNet and CIFAR
                    val_iters = 500 if cfg.dataset_name == 'imagenet' else 100
                    eval_dict, _ = run_eval(val_dataloader, val_iters, model,
                                            criterion, discrip_str,
                                            dataset_name=cfg.dataset_name)
                    val_top1_value = eval_dict['top1'].item()
                    val_top5_value = eval_dict['top5'].item()
                    val_loss_value = eval_dict['loss'].item()
                    for tag, value in zip(
                            tb_tags,
                            [val_top1_value, val_top5_value, val_loss_value]):
                        tb_writer.add_scalars(tag, {'Val': value}, iteration)
                    engine.log(
                        'validate at epoch {}, top1={:.5f}, top5={:.5f}, loss={:.6f}'
                        .format(epoch, val_top1_value, val_top5_value,
                                val_loss_value))
                    model.train()

                for _ in pbar:
                    start_time = time.time()
                    data, label = load_cuda_data(train_dataloader,
                                                 cfg.dataset_name)
                    data_time = time.time() - start_time

                    train_net_time_start = time.time()
                    acc, acc5, loss = train_one_step(
                        model, data, label, optimizer, criterion,
                        param_name_to_merge_matrix=param_name_to_merge_matrix,
                        param_name_to_decay_matrix=param_name_to_decay_matrix)
                    train_net_time_end = time.time()

                    if TRAIN_SPEED_START * max_iters < iteration < TRAIN_SPEED_END * max_iters:
                        recorded_train_examples += cfg.global_batch_size
                        recorded_train_time += train_net_time_end - train_net_time_start

                    scheduler.step()

                    if iteration % cfg.tb_iter_period == 0 and is_main_process:
                        for tag, value in zip(
                                tb_tags, [acc.item(), acc5.item(), loss.item()]):
                            tb_writer.add_scalars(tag, {'Train': value},
                                                  iteration)

                    top1.update(acc.item())
                    top5.update(acc5.item())
                    losses.update(loss.item())

                    pbar_dic = OrderedDict()
                    pbar_dic['data-time'] = '{:.2f}'.format(data_time)
                    pbar_dic['cur_iter'] = iteration
                    pbar_dic['lr'] = scheduler.get_lr()[0]
                    pbar_dic['top1'] = '{:.5f}'.format(top1.mean)
                    pbar_dic['top5'] = '{:.5f}'.format(top5.mean)
                    pbar_dic['loss'] = '{:.5f}'.format(losses.mean)
                    pbar.set_postfix(pbar_dic)

                    if iteration >= max_iters or iteration % cfg.ckpt_iter_period == 0:
                        engine.update_iteration(iteration)
                        if (not engine.distributed) or (engine.distributed
                                                        and is_main_process):
                            engine.save_and_link_checkpoint(cfg.output_dir)

                    iteration += 1
                    if iteration >= max_iters:
                        break

                # do something after an epoch?
                if iteration >= max_iters:
                    break

            # do something after the training
            if recorded_train_time > 0:
                exp_per_sec = recorded_train_examples / recorded_train_time
            else:
                exp_per_sec = 0
            engine.log(
                'TRAIN speed: from {} to {} iterations, batch_size={}, examples={}, total_net_time={:.4f}, examples/sec={}'
                .format(int(TRAIN_SPEED_START * max_iters),
                        int(TRAIN_SPEED_END * max_iters),
                        cfg.global_batch_size, recorded_train_examples,
                        recorded_train_time, exp_per_sec))
            if cfg.save_weights:
                engine.save_checkpoint(cfg.save_weights)
                print('NOTE: training finished, saved to {}'.format(
                    cfg.save_weights))
            engine.save_hdf5(os.path.join(cfg.output_dir, 'finish.hdf5'))

        csgd_prune_and_save(engine=engine,
                            layer_idx_to_clusters=layer_idx_to_clusters,
                            save_file=pruned_weights,
                            succeeding_strategy=succeeding_strategy,
                            new_deps=target_deps)
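# train_one_step is defined elsewhere in this repo. The sketch below is an
# assumption about the centripetal update it applies, based on the C-SGD
# formulation, not a verbatim copy: after backward(), each clustered conv
# kernel's gradient is reshaped to (out_channels, -1), averaged within its
# cluster by the merge matrix, and augmented with a decay term that pulls
# the cluster's filters toward their mean, before optimizer.step().
def apply_csgd_update(model, param_name_to_merge_matrix, param_name_to_decay_matrix):
    for name, p in model.named_parameters():
        if name in param_name_to_merge_matrix:
            shape = p.shape
            g = p.grad.reshape(shape[0], -1)  # per-filter gradient rows
            w = p.detach().reshape(shape[0], -1)  # per-filter weight rows
            # cluster-averaged gradient + centripetal decay toward the cluster mean
            new_g = param_name_to_merge_matrix[name].matmul(g) + \
                    param_name_to_decay_matrix[name].matmul(w)
            p.grad.copy_(new_g.reshape(shape))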
def train_epoch_prefetch_generator(curr_epoch, end_epoch, loss_funcs, model,
                                   optimizer, scheduler, tr_loader, local_rank):
    model.train()
    train_loss_record = AvgMeter()
    for train_batch_id, (train_inputs, train_masks, train_names) in enumerate(
            BackgroundGenerator(tr_loader, max_prefetch=2)):
        curr_iter = curr_epoch * len(tr_loader) + train_batch_id
        if user_config["sche_usebatch"]:
            scheduler.step(optimizer, curr_epoch=curr_iter)

        train_inputs = train_inputs.cuda(non_blocking=True)
        train_masks = train_masks.cuda(non_blocking=True)
        train_preds = model(train_inputs)
        train_loss, loss_item_list = get_total_loss(train_preds, train_masks,
                                                    loss_funcs)

        optimizer.zero_grad()
        if user_config["use_amp"]:
            with amp.scale_loss(train_loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            train_loss.backward()
        optimizer.step()

        if user_config["is_distributed"]:
            reduced_loss = allreduce_tensor(train_loss)
        else:
            reduced_loss = train_loss
        train_iter_loss = reduced_loss.item()
        train_loss_record.update(train_iter_loss, train_inputs.size(0))

        if local_rank == 0:
            lr_str = ",".join(f"{param_groups['lr']:.7f}"
                              for param_groups in optimizer.param_groups)
            log = (
                f"[I:{train_batch_id}/{len(tr_loader)}/{curr_iter}/{total_iter_num}]"
                f"[E:{curr_epoch}:{end_epoch}]>[{exp_name}]"
                f"[Lr:{lr_str}][Avg:{train_loss_record.avg:.5f}|Cur:{train_iter_loss:.5f}|"
                f"{loss_item_list}]\n"
                f"{train_names}")
            if user_config["print_freq"] > 0 and (
                    curr_iter + 1) % user_config["print_freq"] == 0:
                print(log)
            if (user_config["record_freq"] > 0
                    and (curr_iter + 1) % user_config["record_freq"] == 0):
                tb_recorder.record_curve("trloss_avg", train_loss_record.avg,
                                         curr_iter)
                tb_recorder.record_curve("trloss_iter", train_iter_loss,
                                         curr_iter)
                tb_recorder.record_curve("lr", optimizer.param_groups,
                                         curr_iter)
                tb_recorder.record_image("trmasks", train_masks, curr_iter)
                tb_recorder.record_image("trsodout", train_preds.sigmoid(),
                                         curr_iter)
                tb_recorder.record_image("trsodin", train_inputs, curr_iter)
                write_data_to_file(log, path_config["tr_log"])
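# allreduce_tensor comes from the project's utilities. A minimal sketch of
# what such a helper typically does in distributed training (an assumption,
# not the verbatim source): sum the tensor across all ranks and divide by the
# world size, so every process logs the same averaged loss.
import torch.distributed as dist

def allreduce_tensor_sketch(tensor):
    reduced = tensor.clone()
    dist.all_reduce(reduced, op=dist.ReduceOp.SUM)  # sum over all ranks
    reduced /= dist.get_world_size()  # turn the sum into a mean
    return reduced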
def train(self):
    for curr_epoch in range(self.start_epoch, self.end_epoch):
        train_loss_record = AvgMeter()
        for train_batch_id, train_data in enumerate(self.tr_loader):
            curr_iter = curr_epoch * len(self.tr_loader) + train_batch_id

            self.opti.zero_grad()
            train_inputs, train_masks, *train_other_data = train_data
            train_inputs = train_inputs.to(self.dev, non_blocking=True)
            train_masks = train_masks.to(self.dev, non_blocking=True)
            train_preds = self.net(train_inputs)
            train_loss, loss_item_list = get_total_loss(
                train_preds, train_masks, self.loss_funcs)
            train_loss.backward()
            self.opti.step()

            if self.args["sche_usebatch"]:
                self.sche.step()

            # Only call item() when accumulating the value
            train_iter_loss = train_loss.item()
            train_batch_size = train_inputs.size(0)
            train_loss_record.update(train_iter_loss, train_batch_size)

            # Log to TensorBoard
            if self.args["tb_update"] > 0 and (
                    curr_iter + 1) % self.args["tb_update"] == 0:
                self.tb.add_scalar("data/trloss_avg", train_loss_record.avg,
                                   curr_iter)
                self.tb.add_scalar("data/trloss_iter", train_iter_loss,
                                   curr_iter)
                for idx, param_groups in enumerate(self.opti.param_groups):
                    self.tb.add_scalar(f"data/lr_{idx}", param_groups["lr"],
                                       curr_iter)
                tr_tb_mask = make_grid(train_masks, nrow=train_batch_size,
                                       padding=5)
                self.tb.add_image("trmasks", tr_tb_mask, curr_iter)
                tr_tb_out_1 = make_grid(train_preds, nrow=train_batch_size,
                                        padding=5)
                self.tb.add_image("trsodout", tr_tb_out_1, curr_iter)

            # Record the data of each iteration
            if self.args["print_freq"] > 0 and (
                    curr_iter + 1) % self.args["print_freq"] == 0:
                lr_str = ",".join(f"{param_groups['lr']:.7f}"
                                  for param_groups in self.opti.param_groups)
                log = (
                    f"[I:{curr_iter}/{self.iter_num}][E:{curr_epoch}:{self.end_epoch}]>"
                    f"[{self.exp_name}]"
                    f"[Lr:{lr_str}]"
                    f"[Avg:{train_loss_record.avg:.5f}|Cur:{train_iter_loss:.5f}|"
                    f"{loss_item_list}]")
                print(log)
                make_log(self.path["tr_log"], log)

        # Adjust the learning rate once per epoch
        if not self.args["sche_usebatch"]:
            self.sche.step()

        # Save and test after every epoch; the checkpoint holds the
        # parameters for epoch curr_epoch + 1
        save_checkpoint(
            model=self.net,
            optimizer=self.opti,
            scheduler=self.sche,
            exp_name=self.exp_name,
            current_epoch=curr_epoch + 1,
            full_net_path=self.path["final_full_net"],
            state_net_path=self.path["final_state_net"],
        )  # save the parameters

        total_results = self.test()
        # Save the results into an xlsx file.
        write_xlsx(self.exp_name, total_results)
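# get_total_loss is defined elsewhere in the project. A minimal sketch under
# the assumption that it evaluates every configured loss on (predictions,
# masks), sums them for backprop, and returns the individual terms for
# logging, matching how loss_item_list is printed in the trainers above.
def get_total_loss_sketch(preds, masks, loss_funcs):
    loss_list = [loss_fn(preds, masks) for loss_fn in loss_funcs]
    total_loss = sum(loss_list)  # summed scalar used for backward()
    loss_item_list = [f"{loss.item():.5f}" for loss in loss_list]  # for logs
    return total_loss, loss_item_list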
def main():
    conf = Conf()
    args = parse(conf)
    device = conf.get_device()
    conf.suppress_random(set_determinism=args.set_determinism)
    saver = Saver(conf.log_path, args.exp_name)

    train_loader, query_loader, gallery_loader, queryimg_loader, galleryimg_loader = \
        get_dataloaders(args.dataset_name, conf.nas_path, device, args)

    num_pids = train_loader.dataset.get_num_pids()
    net = nn.DataParallel(get_model(args, num_pids))
    net = net.to(device)
    saver.write_logs(net.module, vars(args))

    opt = Adam(net.parameters(), lr=1e-4, weight_decay=args.wd)
    milestones = list(
        range(args.first_milestone, args.num_epochs, args.step_milestone))
    scheduler = lr_scheduler.MultiStepLR(opt, milestones=milestones,
                                         gamma=args.gamma)

    triplet_loss = OnlineTripletLoss('soft', True, reduction='mean').to(device)
    class_loss = nn.CrossEntropyLoss(reduction='mean').to(device)

    print("EXP_NAME: ", args.exp_name)

    for e in range(args.num_epochs):
        if e % args.eval_epoch_interval == 0 and e > 0:
            ev = Evaluator(net, query_loader, gallery_loader, queryimg_loader,
                           galleryimg_loader, DATA_CONFS[args.dataset_name],
                           device)
            ev.eval(saver, e, args.verbose)
        if e % args.save_epoch_interval == 0 and e > 0:
            saver.save_net(net.module, f'chk_{e // args.save_epoch_interval}')

        avm = AvgMeter(['triplet', 'class'])
        for it, (x, y, cams) in enumerate(train_loader):
            net.train()
            x, y = x.to(device), y.to(device)
            opt.zero_grad()
            embeddings, f_class = net(x, return_logits=True)
            triplet_loss_batch = triplet_loss(embeddings, y)
            class_loss_batch = class_loss(f_class, y)
            loss = triplet_loss_batch + class_loss_batch
            avm.add([triplet_loss_batch.item(), class_loss_batch.item()])
            loss.backward()
            opt.step()

        if e % args.print_epoch_interval == 0:
            stats = avm()
            str_ = f"Epoch: {e}"
            for (l, m) in stats:
                str_ += f" - {l} {m:.2f}"
                saver.dump_metric_tb(m, e, 'losses', f"avg_{l}")
            saver.dump_metric_tb(opt.param_groups[0]['lr'], e, 'lr', 'lr')
            print(str_)

        scheduler.step()

    ev = Evaluator(net, query_loader, gallery_loader, queryimg_loader,
                   galleryimg_loader, DATA_CONFS[args.dataset_name], device)
    ev.eval(saver, e, args.verbose)
    saver.save_net(net.module, 'chk_end')
    saver.writer.close()
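# Note that the AvgMeter used in main() takes a list of metric names, unlike
# the scalar meter used elsewhere in this file. A minimal sketch consistent
# with its usage above (avm.add([...]) once per batch, avm() yielding
# (name, mean) pairs); the real class may differ.
class NamedAvgMeterSketch:
    def __init__(self, names):
        self.names = names
        self.sums = [0.0] * len(names)
        self.count = 0

    def add(self, values):
        # one value per tracked metric, in constructor order
        for i, v in enumerate(values):
            self.sums[i] += v
        self.count += 1

    def __call__(self):
        # (name, running mean) pairs for logging
        return [(n, s / max(self.count, 1))
                for n, s in zip(self.names, self.sums)]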
def run_eval(val_data, max_iters, net, criterion, discrip_str, dataset_name):
    # tqdm is a fast, extensible progress bar: wrapping any iterator with
    # tqdm(iterator) adds progress information to long loops.
    pbar = tqdm(range(max_iters))
    top1 = AvgMeter()  # instantiate the meters
    top5 = AvgMeter()
    losses = AvgMeter()
    pbar.set_description('Validation' + discrip_str)  # text on the left of the bar
    total_net_time = 0
    with torch.no_grad():
        for iter_idx, i in enumerate(pbar):
            start_time = time.time()
            data, label = load_cuda_data(val_data, dataset_name=dataset_name)
            data_time = time.time() - start_time

            net_time_start = time.time()
            pred = net(data)
            net_time_end = time.time()
            if iter_idx >= SPEED_TEST_SAMPLE_IGNORE_RATIO * max_iters:
                total_net_time += net_time_end - net_time_start

            loss = criterion(pred, label)
            acc, acc5 = torch_accuracy(pred, label, (1, 5))
            top1.update(acc.item())
            top5.update(acc5.item())
            losses.update(loss.item())

            pbar_dic = OrderedDict()
            pbar_dic['data-time'] = '{:.2f}'.format(data_time)
            pbar_dic['top1'] = '{:.5f}'.format(top1.mean)
            pbar_dic['top5'] = '{:.5f}'.format(top5.mean)
            pbar_dic['loss'] = '{:.5f}'.format(losses.mean)
            pbar.set_postfix(pbar_dic)  # text on the right of the bar

    metric_dic = {
        'top1': torch.tensor(top1.mean),
        'top5': torch.tensor(top5.mean),
        'loss': torch.tensor(losses.mean)
    }
    # reduced_metric_dic = reduce_loss_dict(metric_dic)
    reduced_metric_dic = metric_dic  # TODO note this
    # returns {top1, top5, loss} and the net forward time
    return reduced_metric_dic, total_net_time
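# torch_accuracy is imported from the project's utilities. A standard top-k
# accuracy sketch consistent with its calls here and in eval_one_epoch
# (one percentage tensor per requested k); the real helper may differ.
def torch_accuracy_sketch(output, target, topk=(1,)):
    maxk = max(topk)
    batch_size = target.size(0)
    # indices of the top-maxk predictions, shape (maxk, batch)
    _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    # percentage of samples whose label appears in the top-k predictions
    return [correct[:k].reshape(-1).float().sum() * (100.0 / batch_size)
            for k in topk]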
def train(self):
    for curr_epoch in range(self.start_epoch, self.end_epoch):
        train_loss_record = AvgMeter()
        if self.args["lr_type"] == "poly":
            self.change_lr(curr_epoch)
        else:
            raise NotImplementedError

        for train_batch_id, train_data in enumerate(self.tr_loader):
            curr_iter = curr_epoch * len(self.tr_loader) + train_batch_id

            self.opti.zero_grad()
            train_inputs, train_masks, *train_other_data = train_data
            train_inputs = train_inputs.to(self.dev, non_blocking=True)
            train_masks = train_masks.to(self.dev, non_blocking=True)
            if self.data_mode == "RGBD":
                # train_other_data is a list
                train_depths = train_other_data[-1]
                train_depths = train_depths.to(self.dev, non_blocking=True)
                train_preds = self.net(train_inputs, train_depths)
            elif self.data_mode == "RGB":
                train_preds = self.net(train_inputs)
            else:
                raise NotImplementedError

            train_loss, loss_item_list = self.total_loss(train_preds,
                                                         train_masks)
            train_loss.backward()
            self.opti.step()

            # Only call item() when accumulating the value
            train_iter_loss = train_loss.item()
            train_batch_size = train_inputs.size(0)
            train_loss_record.update(train_iter_loss, train_batch_size)

            # Record the data of each iteration
            if self.args["print_freq"] > 0 and (
                    curr_iter + 1) % self.args["print_freq"] == 0:
                log = (
                    f"[I:{curr_iter}/{self.iter_num}][E:{curr_epoch}:{self.end_epoch}]>"
                    f"[{self.model_name}]"
                    f"[Lr:{self.opti.param_groups[0]['lr']:.7f}]"
                    f"[Avg:{train_loss_record.avg:.5f}|Cur:{train_iter_loss:.5f}|"
                    f"{loss_item_list}]")
                print(log)
                make_log(self.path["tr_log"], log)

        # Save after every epoch; the checkpoint holds the parameters for
        # epoch curr_epoch + 1
        self.save_checkpoint(
            curr_epoch + 1,
            full_net_path=self.path["final_full_net"],
            state_net_path=self.path["final_state_net"],
        )

    # Run the final tests and report the results on each test set
    print(" ==>> Training finished <<== ")
    for data_name, data_path in self.te_data_list.items():
        print(f" ==>> Testing on dataset {data_name} <<== ")
        self.te_loader, self.te_length = create_loader(
            data_path=data_path,
            mode="test",
            get_length=True,
            data_mode=self.data_mode,
        )
        self.save_path = os.path.join(self.path["save"], data_name)
        if not os.path.exists(self.save_path):
            print(f" ==>> {self.save_path} does not exist, creating it <<==")
            os.makedirs(self.save_path)
        results = self.test(save_pre=self.save_pre)
        fixed_pre_results = {k: f"{v:.3f}" for k, v in results.items()}
        msg = (f" ==>> Results on test set {data_name}:'{data_path}'\n"
               f" >> {fixed_pre_results}")
        print(msg)
        make_log(self.path["te_log"], msg)
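# change_lr is defined elsewhere on this trainer. A minimal sketch of the
# "poly" schedule it presumably implements (an assumption based on the
# polynomial decay commonly used in saliency/segmentation training):
# lr = base_lr * (1 - curr_epoch / total_epochs) ** power.
def change_lr_sketch(optimizer, curr_epoch, total_epochs, base_lr, power=0.9):
    lr = base_lr * (1 - curr_epoch / total_epochs) ** power
    for param_group in optimizer.param_groups:
        param_group["lr"] = lr  # apply the decayed rate to every group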