def train_ddp(rank, cfg, return_dict):
    cfg.gpu = rank + cfg.base_gpu
    print(f"Train Running basic DDP example on rank {rank}.")
    setup(rank, cfg.world_size, start_port)

    cfg.log_file = 'train_{}.txt'.format(cfg.gpu)
    log_file = os.path.join(cfg.exp_dir, cfg.log_file)
    logging.config.dictConfig(log_utils.get_logging_dict(log_file, mode='a+'))
    cfg.logger = logging.getLogger('train')

    model = net_utils.get_model(cfg)
    cfg.logger.info('Moving the model to GPU {}'.format(cfg.gpu))
    model = net_utils.move_model_to_gpu(cfg, model)
    cfg.logger.info('Model conv 1 initialization {}'.format(
        torch.sum(model.backbone.conv1.weight)))

    if cfg.world_size > 1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model = DDP(model, device_ids=[cfg.gpu], output_device=cfg.gpu)

    trn_pretrain.trn(cfg, model)

    if cfg.gpu == cfg.base_gpu:
        return_dict['ckpt_path'] = None

    cleanup()
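

# A minimal launcher sketch (an assumption, not part of the original file): train_ddp is
# written to be spawned once per rank, with a shared dict for collecting results and with
# cfg carrying world_size and base_gpu. The helper name launch_pretrain_ddp is hypothetical;
# it only illustrates how torch.multiprocessing.spawn would drive the worker above.
def launch_pretrain_ddp(cfg):
    from multiprocessing import Manager
    import torch.multiprocessing as mp

    manager = Manager()
    return_dict = manager.dict()  # shared across ranks; the base-GPU rank writes 'ckpt_path'
    # mp.spawn calls train_ddp(rank, cfg, return_dict) for every rank in [0, world_size).
    mp.spawn(train_ddp, args=(cfg, return_dict), nprocs=cfg.world_size, join=True)
    return return_dict.get('ckpt_path', None)
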
def train_dense(cfg, generation):
    model = net_utils.get_model(cfg)

    if cfg.pretrained and cfg.pretrained != 'imagenet':
        net_utils.load_pretrained(cfg.pretrained, cfg.gpu, model, cfg)
        model = net_utils.move_model_to_gpu(cfg, model)
        net_utils.split_reinitialize(cfg, model, reset_hypothesis=cfg.reset_hypothesis)
    else:
        model = net_utils.move_model_to_gpu(cfg, model)

    cfg.trainer = 'default_cls'
    # cfg.split_rate = 1.0
    # cfg.bias_split_rate = 1.0
    cfg.pretrained = None

    ckpt_path = KE_model.ke_cls_train(cfg, model, generation)

    return ckpt_path
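

# A hedged sketch of how train_dense is typically driven across generations in a
# Knowledge-Evolution style loop: each generation warm-starts from the checkpoint
# returned by the previous one. The function name run_generations and the attribute
# cfg.num_generations are assumptions for illustration only.
def run_generations(cfg):
    ckpt_path = cfg.pretrained  # may be None or 'imagenet' for the first generation
    for generation in range(cfg.num_generations):
        cfg.pretrained = ckpt_path  # train_dense clears cfg.pretrained, so restore it here
        ckpt_path = train_dense(cfg, generation)
    return ckpt_path
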
def train_ddp(rank, cfg, return_dict):
    cfg.gpu = rank + cfg.base_gpu
    print(f"Train Running basic DDP example on rank {rank}.")
    setup(rank, cfg.world_size, start_port)

    cfg.log_file = 'train_{}.txt'.format(cfg.gpu)
    log_file = os.path.join(cfg.exp_dir, cfg.log_file)
    logging.config.dictConfig(log_utils.get_logging_dict(log_file, mode='a+'))
    cfg.logger = logging.getLogger('train')

    cfg.logger.info('Getting the model')
    pretrain_model = net_utils.get_model(cfg)
    pretrain_model = torch.nn.DataParallel(pretrain_model)
    if cfg.pretrained and cfg.pretrained != 'imagenet':
        net_utils.load_pretrained(cfg.pretrained, cfg.gpu, pretrain_model, cfg)

    # New linear classification head on top of the pretrained backbone features.
    classifier_layer = nn.Linear(
        in_features=pretrain_model.module.backbone.output_dim,
        out_features=cfg.num_cls,
        bias=True).cuda()

    # Freeze every backbone parameter; only the new classifier head is trained.
    for p in pretrain_model.parameters():
        p.requires_grad = False

    cfg.logger.info('Start Training: Model conv 1 initialization {}'.format(
        torch.sum(pretrain_model.module.backbone.conv1.weight)))

    model = nn.Sequential(
        pretrain_model.module.backbone,
        classifier_layer,
    )

    cfg.logger.info('Moving the model to GPU {}'.format(cfg.gpu))
    model = net_utils.move_model_to_gpu(cfg, model)
    cfg.logger.info('Model conv 1 initialization {}'.format(
        torch.sum(model[0].conv1.weight)))

    if cfg.world_size > 1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model = DDP(model, device_ids=[cfg.gpu], output_device=cfg.gpu)

    trn_classifier.trn(cfg, model)

    if cfg.gpu == cfg.base_gpu:
        return_dict['ckpt_path'] = None

    cleanup()
def eval_slim(cfg, generation):
    original_num_epochs = cfg.epochs
    # cfg.epochs = 0
    softmax_criterion = nn.CrossEntropyLoss().cuda()
    epoch = 1
    writer = None

    model = net_utils.get_model(cfg)
    net_utils.load_pretrained(cfg.pretrained, cfg.gpu, model, cfg)
    # if cfg.reset_mask:
    #     net_utils.reset_mask(cfg, model)
    model = net_utils.move_model_to_gpu(cfg, model)

    save_filter_stats = (cfg.arch in ['split_alexnet', 'split_vgg11_bn'])
    if save_filter_stats:
        for n, m in model.named_modules():
            if hasattr(m, "weight") and m.weight is not None:
                if hasattr(m, "mask"):
                    layer_mask = m.mask
                    if m.__class__ == conv_type.SplitConv:
                        # filter_state = [''.join(map(str, ((score_mask == True).type(torch.int).squeeze().tolist())))]
                        # Mean |weight| inside vs. outside the split mask for this layer.
                        filter_mag = ['{},{}'.format(
                            float(torch.mean(torch.abs(m.weight[layer_mask.type(torch.bool)]))),
                            float(torch.mean(torch.abs(m.weight[(1 - layer_mask).type(torch.bool)])))
                        )]
                        os_utils.txt_write(osp.join(cfg.exp_dir, n.replace('.', '_') + '_mean_magnitude.txt'),
                                           filter_mag, mode='a+')

    dummy_input_tensor = torch.zeros((1, 3, 224, 224)).cuda()
    total_ops, total_params = model_profile.profile(model, dummy_input_tensor)
    cfg.logger.info("Dense #Ops: %f GOps" % (total_ops / 1e9))
    cfg.logger.info("Dense #Parameters: %f M" % (total_params / 1e6))

    original_split_rate = cfg.split_rate
    original_bias_split_rate = cfg.bias_split_rate

    if cfg.split_mode == 'kels':
        # Build a slim model at the split rate, extract the fit hypothesis into it, and test it.
        cfg.slim_factor = cfg.split_rate
        cfg.split_rate = 1.0
        cfg.bias_split_rate = 1.0
        split_model = net_utils.get_model(cfg)
        split_model = net_utils.move_model_to_gpu(cfg, split_model)

        total_ops, total_params = model_profile.profile(split_model, dummy_input_tensor)
        cfg.logger.info("Split #Ops: %f GOps" % (total_ops / 1e9))
        cfg.logger.info("Split #Parameters: %f M" % (total_params / 1e6))

        net_utils.extract_slim(split_model, model)
        dataset = getattr(data, cfg.set)(cfg)
        train, validate = get_trainer(cfg)
        last_val_acc1, last_val_acc5 = validate(dataset.tst_loader, split_model, softmax_criterion, cfg, writer, epoch)
        cfg.logger.info('Split Model : {} , {}'.format(last_val_acc1, last_val_acc5))
    else:
        last_val_acc1 = 0
        last_val_acc5 = 0

    csv_utils.write_cls_result_to_csv(
        # Validation
        curr_acc1=0,
        curr_acc5=0,
        best_acc1=0,
        best_acc5=0,

        # Test
        last_tst_acc1=last_val_acc1,
        last_tst_acc5=last_val_acc5,
        best_tst_acc1=0,
        best_tst_acc5=0,

        # Train
        best_train_acc1=0,
        best_train_acc5=0,

        split_rate='slim',
        bias_split_rate='slim',
        base_config=cfg.name,
        name=cfg.name,
    )

    # Restore the config values modified above.
    cfg.epochs = original_num_epochs
    cfg.slim_factor = 1
    cfg.split_rate = original_split_rate
    cfg.bias_split_rate = original_bias_split_rate
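

# Illustrative (hypothetical) call into eval_slim once a generation has finished training,
# assuming the KELS split mode so the slim hypothesis is extracted, profiled, and evaluated
# on the test loader; evaluate_generation is not a function from the original file.
def evaluate_generation(cfg, generation, ckpt_path):
    cfg.pretrained = ckpt_path   # checkpoint produced by ke_cls_train for this generation
    cfg.split_mode = 'kels'      # the branch in eval_slim that builds and tests the slim model
    eval_slim(cfg, generation)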