def save_checkpoint(self,
                    out_dir,
                    filename_tmpl='epoch_{}.pth',
                    save_optimizer=True,
                    meta=None,
                    create_symlink=True):
    """Save the checkpoint.

    Args:
        out_dir (str): The directory that checkpoints are saved.
        filename_tmpl (str, optional): The checkpoint filename template,
            which contains a placeholder for the epoch number.
            Defaults to 'epoch_{}.pth'.
        save_optimizer (bool, optional): Whether to save the optimizer to
            the checkpoint. Defaults to True.
        meta (dict, optional): The meta information to be saved in the
            checkpoint. Defaults to None.
        create_symlink (bool, optional): Whether to create a symlink
            "latest.pth" to point to the latest checkpoint.
            Defaults to True.
    """
    if meta is None:
        meta = dict(epoch=self.epoch + 1, iter=self.iter)
    elif isinstance(meta, dict):
        meta.update(epoch=self.epoch + 1, iter=self.iter)
    else:
        raise TypeError(
            f'meta should be a dict or None, but got {type(meta)}')
    # NOTE(review): self.meta is merged last, so it can overwrite the
    # epoch/iter recorded above — confirm this precedence is intended.
    if self.meta is not None:
        meta.update(self.meta)

    filename = filename_tmpl.format(self.epoch + 1)
    filepath = osp.join(out_dir, filename)
    optimizer = self.optimizer if save_optimizer else None
    save_checkpoint(self.model, filepath, optimizer=optimizer, meta=meta)
    if create_symlink:
        import platform
        import shutil
        dst_file = osp.join(out_dir, 'latest.pth')
        # Bug fix: `os.symlink` is unsupported (or needs elevated
        # privileges) on Windows, so copy the checkpoint there instead of
        # failing — mirrors the guarded sibling implementation.
        if platform.system() != 'Windows':
            mmcv.symlink(filename, dst_file)
        else:
            shutil.copy(filepath, dst_file)
def test_save_checkpoint():
    """Runner.save_checkpoint writes epoch_1.pth plus a latest.pth link
    that resolves to the same file and is loadable by torch."""
    try:
        import torch
        from torch import nn
    except ImportError:
        # Fixed typo in the warning message: "absense" -> "absence".
        warnings.warn('Skipping test_save_checkpoint in the absence of torch')
        return
    import mmcv.runner

    model = nn.Linear(1, 1)
    runner = mmcv.runner.Runner(
        model=model,
        batch_processor=lambda x: x,
        logger=logging.getLogger())

    with tempfile.TemporaryDirectory() as root:
        runner.save_checkpoint(root)

        latest_path = osp.join(root, 'latest.pth')
        epoch1_path = osp.join(root, 'epoch_1.pth')

        # both the epoch file and the "latest" alias must exist and point
        # at the same underlying checkpoint
        assert osp.exists(latest_path)
        assert osp.exists(epoch1_path)
        assert osp.realpath(latest_path) == osp.realpath(epoch1_path)

        # the saved file must be a valid torch checkpoint
        torch.load(latest_path)
def train(model, training_data, validation_data, optimizer, scheduler, cfg):
    """Full training loop: one training + one validation pass per epoch.

    Logs losses/accuracies to stdout and to the global `vis` visualizer,
    steps the LR scheduler on the validation loss, and saves a checkpoint
    whenever the validation accuracy ties or beats the best seen so far.

    Args:
        model: the network being optimized.
        training_data: iterable consumed by `train_epoch`.
        validation_data: iterable consumed by `eval_epoch`.
        optimizer: optimizer used by `train_epoch`.
        scheduler: stepped with the validation loss (ReduceLROnPlateau-
            style interface).
        cfg: config providing TRAIN.EPOCHES and MODEL.SAVE_IN / MODEL.NAME.
    """
    valid_accus = []  # validation accuracy of every epoch so far
    for epoch_i in range(cfg.TRAIN.EPOCHES):
        print('[ Epoch', epoch_i, ']')

        # ---- training phase ----
        start = time.time()
        train_loss, train_accu = train_epoch(model, training_data, optimizer, )
        print(' - (Training) ,loss: {loss:3.3f}, accuracy: {accu:3.3f} %, ' \
              'elapse: {elapse:3.3f} min'.format(loss=train_loss,
                                                 accu=100 * train_accu,
                                                 elapse=(time.time() - start) / 60))
        # NOTE(review): AP / mAP / Loss_meter are globals, presumably
        # updated inside train_epoch / eval_epoch — confirm.
        vis.log(
            "Phase:{phase},Epoch:{epoch},AP:{AP},mAP:{mAP},train_loss:{loss}".format(
                phase="train",
                epoch=epoch_i,
                AP=AP.value(),
                mAP=mAP.value(),
                loss=Loss_meter.value()[0],))

        # ---- validation phase ----
        start = time.time()
        valid_loss, valid_accu = eval_epoch(model, validation_data, )
        print(' - (Validation) ,loss: {loss:3.3f}, accuracy: {accu:3.3f} %, ' \
              'elapse: {elapse:3.3f} min'.format(loss=valid_loss,
                                                 accu=100 * valid_accu,
                                                 elapse=(time.time() - start) / 60))
        scheduler.step(valid_loss)  # update learning rate

        vis.plot_many_stack({'val loss': valid_loss, 'train loss': train_loss})
        vis.plot_many_stack({'val accuracy': valid_accu, 'train accuracy': train_accu})
        vis.log(
            "Phase:{phase},Epoch:{epoch}, AP:{AP},mAP:{mAP},val_loss:{loss}".format(
                phase="validation",
                epoch=epoch_i,
                AP=AP.value(),
                mAP=mAP.value(),
                loss=Loss_meter.value()[0], ))

        valid_accus += [valid_accu]  # keep the accuracy of every epoch
        # checkpoint whenever this epoch ties or beats the best accuracy
        if valid_accu >= max(valid_accus):
            save_checkpoint(model, cfg.MODEL.SAVE_IN + cfg.MODEL.NAME + '.pth')
def main():
    """Load a recognizer, fuse its conv+bn layers, and save the result."""
    args = parse_args()
    # build the model from the given config / checkpoint pair
    recognizer = init_recognizer(args.config, args.checkpoint)
    # fold BatchNorm statistics into the preceding conv weights
    fused = fuse_module(recognizer)
    save_checkpoint(fused, args.out)
def main():
    """Load a detector, fuse its conv+bn layers, and save the fused model."""
    args = parse_args()
    # build the model from a config file and a checkpoint file
    model = init_detector(args.config, args.checkpoint)
    # fuse conv and bn layers of the model
    fused_model = fuse_module(model)
    # Bug fix: the function-style mmcv save_checkpoint(model, filename,
    # optimizer=None, meta=None) accepts no `create_symlink` argument
    # (that keyword belongs to the runner *method*), so passing it raised
    # a TypeError at save time.
    save_checkpoint(fused_model, args.out)
def save_ckpt(cls_head):
    """Persist the classification head weights and the epoch counter."""
    head_path = './{}/{}_{}.pth'.format(cfg.work_dir, exp_prefix, use_model)
    epoch_path = './{}/{}_{}_epoch.pth'.format(cfg.work_dir, exp_prefix,
                                               use_model)
    # weights go through mmcv's checkpoint helper; the bare epoch counter
    # is serialized separately with torch.save
    save_checkpoint(cls_head, head_path)
    torch.save(epoch, epoch_path)
def test_init_weights(self):
    """Weight-init cfg is applied and checkpoints round-trip correctly."""
    # initializing must actually change the projection weights
    cfg = deepcopy(self.cfg)
    cfg['init_cfg'] = [dict(type='TruncNormal', layer='Linear', std=.02)]
    model = T2T_ViT(**cfg)
    weight_before = model.tokens_to_token.project.weight.clone().detach()
    model.init_weights()
    weight_after = model.tokens_to_token.project.weight
    self.assertFalse(torch.allclose(weight_before, weight_after))

    # a saved checkpoint restores pos_embed exactly in a fresh model
    saved_pos_embed = model.pos_embed.clone().detach()
    ckpt_file = os.path.join(tempfile.gettempdir(), 'test.pth')
    save_checkpoint(model, ckpt_file)
    model = T2T_ViT(**deepcopy(self.cfg))
    load_checkpoint(model, ckpt_file, strict=True)
    self.assertTrue(torch.allclose(model.pos_embed, saved_pos_embed))

    # loading into a different img_size resizes the position embedding
    cfg = deepcopy(self.cfg)
    cfg['img_size'] = 384
    model = T2T_ViT(**cfg)
    load_checkpoint(model, ckpt_file, strict=True)
    expected = timm_resize_pos_embed(saved_pos_embed, model.pos_embed)
    self.assertTrue(torch.allclose(model.pos_embed, expected))

    os.remove(ckpt_file)
def after_train_epoch(self, runner):
    """Called after every training epoch to evaluate the results.

    Runs single-GPU evaluation on `self.dataloader`, compares the key
    metric against the best score persisted in `<work_dir>/best.json`,
    and on improvement saves `best.pth` and updates the json record.
    """
    # only evaluate every `self.interval` epochs
    if not self.every_n_epochs(runner, self.interval):
        return
    current_ckpt_path = osp.join(runner.work_dir,
                                 f'epoch_{runner.epoch + 1}.pth')
    json_path = osp.join(runner.work_dir, 'best.json')
    # lazily restore best-so-far state from disk (e.g. after a resume)
    if osp.exists(json_path) and len(self.best_json) == 0:
        self.best_json = mmcv.load(json_path)
        self.best_score = self.best_json['best_score']
        self.best_ckpt = self.best_json['best_ckpt']
        self.key_indicator = self.best_json['key_indicator']

    # local import — presumably to avoid a circular/heavy dependency at
    # module import time; confirm.
    from mmpose.apis import single_gpu_test
    results = single_gpu_test(runner.model, self.dataloader)
    key_score = self.evaluate(runner, results)
    if (self.save_best and self.compare_func(key_score, self.best_score)):
        self.best_score = key_score
        self.logger.info(
            f'Now best checkpoint is epoch_{runner.epoch + 1}.pth')
        self.best_json['best_score'] = self.best_score
        self.best_json['best_ckpt'] = current_ckpt_path
        self.best_json['key_indicator'] = self.key_indicator
        # NOTE(review): writes a fresh best.pth from the live model rather
        # than copying the epoch checkpoint recorded in best_ckpt — the
        # two can differ if the epoch file was saved earlier; confirm.
        save_checkpoint(runner.model,
                        osp.join(runner.work_dir, 'best.pth'))
        mmcv.dump(self.best_json, json_path)
def test_load_classes_name():
    """CLASSES round-trips through checkpoint meta, including for
    DDP-wrapped models where it lives on `.module`."""
    import os
    import tempfile

    from mmcv.runner import load_checkpoint, save_checkpoint

    ckpt_file = os.path.join(tempfile.gettempdir(), 'checkpoint.pth')

    # plain model without CLASSES -> nothing recorded in meta
    model = Model()
    save_checkpoint(model, ckpt_file)
    ckpt = load_checkpoint(model, ckpt_file)
    assert 'meta' in ckpt and 'CLASSES' not in ckpt['meta']

    # plain model with CLASSES -> stored and restored via meta
    model.CLASSES = ('class1', 'class2')
    save_checkpoint(model, ckpt_file)
    ckpt = load_checkpoint(model, ckpt_file)
    assert 'meta' in ckpt and 'CLASSES' in ckpt['meta']
    assert ckpt['meta']['CLASSES'] == ('class1', 'class2')

    # wrapped model without CLASSES on .module -> nothing recorded
    wrapped_model = DDPWrapper(Model())
    save_checkpoint(wrapped_model, ckpt_file)
    ckpt = load_checkpoint(wrapped_model, ckpt_file)
    assert 'meta' in ckpt and 'CLASSES' not in ckpt['meta']

    # wrapped model with CLASSES on .module -> stored and restored
    wrapped_model.module.CLASSES = ('class1', 'class2')
    save_checkpoint(wrapped_model, ckpt_file)
    ckpt = load_checkpoint(wrapped_model, ckpt_file)
    assert 'meta' in ckpt and 'CLASSES' in ckpt['meta']
    assert ckpt['meta']['CLASSES'] == ('class1', 'class2')

    # clean up the temp file
    os.remove(ckpt_file)
def save_icevision_checkpoint(
    model,
    filename,
    optimizer=None,
    meta=None,
    model_name=None,
    backbone_name=None,
    classes=None,
    img_size=None,
):
    """Save checkpoint to file.

    The checkpoint will have 3 fields: ``meta``, ``state_dict`` and
    ``optimizer``. By default ``meta`` will contain version and time info.

    Args:
        model (Module): Module whose params are to be saved.
        filename (str): Checkpoint filename.
        optimizer (:obj:`Optimizer`, optional): Optimizer to be saved.
        meta (dict, optional): Metadata to be saved in checkpoint.

    Examples:
        >>> save_icevision_checkpoint(model_saved,
                        model_name='mmdet.retinanet',
                        backbone_name='resnet50_fpn_1x',
                        class_map =  class_map,
                        img_size=img_size,
                        filename=checkpoint_path,
                        meta={'icevision_version': '0.9.1'})
    """
    if meta is None:
        meta = {}
    elif not isinstance(meta, dict):
        raise TypeError(f"meta must be a dict or None, but got {type(meta)}")

    # record only the optional fields that were actually supplied
    # (insertion order matches the original explicit updates)
    extra_meta = dict(
        classes=classes,
        model_name=model_name,
        img_size=img_size,
        backbone_name=backbone_name,
    )
    for key, value in extra_meta.items():
        if value:
            meta[key] = value

    save_checkpoint(model, filename, optimizer=optimizer, meta=meta)
def test_repvgg_load():
    """A deploy-mode checkpoint reloads into a deploy=True model with
    identical outputs."""
    net = RepVGG('A1', out_indices=(0, 1, 2, 3))
    dummy_input = torch.randn((1, 3, 224, 224))
    ckpt_file = os.path.join(tempfile.gettempdir(), 'ckpt.pth')

    # reparameterize, then record the reference outputs
    net.switch_to_deploy()
    net.eval()
    expected_feats = net(dummy_input)

    # round-trip through a checkpoint into a freshly built deploy model
    net_deploy = RepVGG('A1', out_indices=(0, 1, 2, 3), deploy=True)
    save_checkpoint(net, ckpt_file)
    load_checkpoint(net_deploy, ckpt_file, strict=True)
    loaded_feats = net_deploy(dummy_input)

    for feat, feat_load in zip(expected_feats, loaded_feats):
        assert torch.allclose(feat, feat_load)
def test_load_checkpoint():
    """Both v1 and v2 Swin checkpoints load strictly into a v2 model."""
    model = SwinTransformer(arch='tiny')
    path = os.path.join(tempfile.gettempdir(), 'ckpt.pth')
    assert model._version == 2

    # round-trip a v2 checkpoint
    save_checkpoint(model, path)
    load_checkpoint(model, path, strict=True)

    # fabricate a v1 checkpoint: rename norm3 -> norm, downgrade version
    model.norm = model.norm3
    model._version = 1
    del model.norm3
    save_checkpoint(model, path)

    # a fresh v2 model must still load it strictly
    model = SwinTransformer(arch='tiny')
    load_checkpoint(model, path, strict=True)
def save_checkpoint(self,
                    out_dir,
                    filename_tmpl='epoch_{}.pth',
                    save_optimizer=True,
                    meta=None,
                    create_symlink=True):
    """Save the checkpoint.

    Args:
        out_dir (str): The directory that checkpoints are saved.
        filename_tmpl (str, optional): The checkpoint filename template,
            which contains a placeholder for the epoch number.
            Defaults to 'epoch_{}.pth'.
        save_optimizer (bool, optional): Whether to save the optimizer to
            the checkpoint. Defaults to True.
        meta (dict, optional): The meta information to be saved in the
            checkpoint. Defaults to None.
        create_symlink (bool, optional): Whether to create a symlink
            "latest.pth" to point to the latest checkpoint.
            Defaults to True.
    """
    if meta is None:
        meta = dict(epoch=self.epoch + 1, iter=self.iter)
    elif isinstance(meta, dict):
        meta.update(epoch=self.epoch + 1, iter=self.iter)
    else:
        raise TypeError(
            f'meta should be a dict or None, but got {type(meta)}')
    if self.meta is not None:
        meta.update(self.meta)

    filename = filename_tmpl.format(self.epoch + 1)
    filepath = osp.join(out_dir, filename)
    optimizer = self.optimizer if save_optimizer else None
    save_checkpoint(self.model, filepath, optimizer=optimizer, meta=meta)
    # in some environments, `os.symlink` is not supported, you may need to
    # set `create_symlink` to False
    if create_symlink:
        dst_file = osp.join(out_dir, 'latest.pth')
        if platform.system() != 'Windows':
            mmcv.symlink(filename, dst_file)
        else:
            # Bug fix: copy must use the absolute/joined `filepath`, not the
            # bare `filename`, which would resolve against the process CWD
            # instead of `out_dir`.
            shutil.copy(filepath, dst_file)
def test_init_weights(self):
    """Weight init changes parameters; checkpoints round-trip; pos_embed
    is resized when the image size differs."""
    cfg = deepcopy(self.cfg)
    cfg['init_cfg'] = [
        dict(
            type='Kaiming',
            layer='Conv2d',
            mode='fan_in',
            nonlinearity='linear')
    ]
    model = DistilledVisionTransformer(**cfg)
    weight_before = model.patch_embed.projection.weight.clone().detach()
    # the distillation token starts at all zeros...
    self.assertTrue(torch.allclose(model.dist_token, torch.tensor(0.)))
    model.init_weights()
    # ...and both it and the projection must move away after init
    self.assertFalse(
        torch.allclose(model.patch_embed.projection.weight, weight_before))
    self.assertFalse(torch.allclose(model.dist_token, torch.tensor(0.)))

    # a saved checkpoint restores pos_embed exactly in a fresh model
    saved_pos_embed = model.pos_embed.clone().detach()
    ckpt_file = os.path.join(tempfile.gettempdir(), 'test.pth')
    save_checkpoint(model, ckpt_file)
    model = DistilledVisionTransformer(**deepcopy(self.cfg))
    load_checkpoint(model, ckpt_file, strict=True)
    self.assertTrue(torch.allclose(model.pos_embed, saved_pos_embed))

    # different img_size -> pos_embed resized (2 extra tokens: cls + dist)
    cfg = deepcopy(self.cfg)
    cfg['img_size'] = 384
    model = DistilledVisionTransformer(**cfg)
    load_checkpoint(model, ckpt_file, strict=True)
    expected = timm_resize_pos_embed(
        saved_pos_embed, model.pos_embed, num_tokens=2)
    self.assertTrue(torch.allclose(model.pos_embed, expected))

    os.remove(ckpt_file)
def test_deploy_(self):
    """Outputs agree before/after switch_to_deploy, and after reloading
    the reparameterized weights into a deploy=True model."""
    inputs = torch.randn((1, 3, 224, 224))
    cfg = dict(
        arch='b', out_indices=(
            1,
            3,
        ), reparam_conv_kernels=(1, 3, 5))
    model = RepMLPNet(**cfg)
    model.eval()
    feats_train = model(inputs)

    # reparameterize in place; every module exposing `deploy` must flip
    model.switch_to_deploy()
    for module in model.modules():
        if hasattr(module, 'deploy'):
            self.assertTrue(module.deploy)
    model.eval()
    feats_deploy = model(inputs)

    assert len(feats_train) == len(feats_deploy)
    for before, after in zip(feats_train, feats_deploy):
        self.assertTrue(
            torch.allclose(before.sum(), after.sum(), rtol=0.1, atol=0.1))

    # round-trip into a freshly built deploy-mode network
    cfg['deploy'] = True
    reloaded = RepMLPNet(**cfg)
    reloaded.eval()
    save_checkpoint(model, self.ckpt_path)
    load_checkpoint(reloaded, self.ckpt_path, strict=True)
    feats_reloaded = reloaded(inputs)

    assert len(feats_deploy) == len(feats_reloaded)
    for expected, actual in zip(feats_deploy, feats_reloaded):
        self.assertTrue(torch.allclose(actual, expected))
def main():
    """Train the RGB/thermal AutoEncoder on the KAIST multispectral set.

    Each epoch: optimize the summed RGB + thermal reconstruction MSE over
    the train loader, adjust the learning rate, save a checkpoint, and
    dump denormalized reconstructions of the last batch as images.
    """
    # base configs
    data_root = '/media/' + getpass.getuser(
    ) + '/Data/DoubleCircle/datasets/kaist-rgbt-encoder/'
    # ImageNet-style normalization kept for reference:
    # img_norm_cfg = dict(
    #     mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
    # img_norm_cfg_t = dict(
    #     mean=[123.675, 123.675, 123.675], std=[58.395, 58.395, 58.395], to_rgb=False)
    img_norm_cfg = dict(mean=[0, 0, 0], std=[255, 255, 255], to_rgb=True)
    img_norm_cfg_t = dict(mean=[0, 0, 0], std=[147, 147, 147], to_rgb=False)
    imgs_per_gpu = 16
    workers_per_gpu = 2
    max_epoch = 50
    base_lr = 1e-2

    # train and test dataset definitions
    train = dict(ann_file=data_root + 'annotations-pkl/train-all.pkl',
                 img_prefix=data_root + 'images/',
                 img_scale=1.0,
                 img_norm_cfg=img_norm_cfg,
                 img_norm_cfg_t=img_norm_cfg_t,
                 size_divisor=None,
                 flip_ratio=0.5,
                 with_mask=False,
                 with_crowd=True,
                 with_label=True)
    test = dict(ann_file=data_root + 'annotations-pkl/test-all-rgb.pkl',
                img_prefix=data_root + 'images/',
                img_scale=1.0,
                img_norm_cfg=img_norm_cfg,
                img_norm_cfg_t=img_norm_cfg_t,
                size_divisor=None,
                flip_ratio=0,
                with_mask=False,
                with_crowd=True,
                with_label=True)
    dataset_train = CoderKaistDataset(**train)
    dataset_test = CoderKaistDataset(**test)
    # train and test data loader
    data_loaders_train = build_dataloader(dataset_train,
                                          imgs_per_gpu,
                                          workers_per_gpu,
                                          num_gpus=1,
                                          dist=False)
    data_loaders_test = build_dataloader(dataset_test,
                                         imgs_per_gpu,
                                         workers_per_gpu,
                                         num_gpus=1,
                                         dist=False)

    # (An earlier MNIST data pipeline and a thermal-only training loop were
    # kept here as large commented-out blocks; removed for readability.)

    # build and initialize the network
    net = AutoEncoder()
    net.init_weights()
    net.cuda()
    # loss_fn = torch.nn.MSELoss(size_average=False)
    loss_fn = torch.nn.MSELoss(reduction='elementwise_mean')
    optimizer = optim.Adam(net.parameters(), lr=base_lr, weight_decay=0.0001)
    print('Start training...\n')

    iter_epoch = len(data_loaders_train)  # iterations per epoch
    for e in range(max_epoch):
        # ---- training phase ----
        net.train()
        loss_iter = 0.0
        for i, data_batch in enumerate(data_loaders_train):
            code, decode_rgb, decode_thermal = net(
                data_batch['img_rgb_in'].cuda(),
                data_batch['img_thermal_in'].cuda())
            # reshape targets to NCHW; assumes 128x160 inputs — TODO confirm
            data_batch['img_rgb_out'] = data_batch['img_rgb_out'].view(
                (-1, 3, 128, 160))
            data_batch['img_thermal_out'] = data_batch['img_thermal_out'].view(
                (-1, 3, 128, 160))
            loss_rgb = loss_fn(decode_rgb.cpu(), data_batch['img_rgb_out'])
            loss_thermal = loss_fn(decode_thermal.cpu(),
                                   data_batch['img_thermal_out'])
            loss = loss_rgb + loss_thermal
            loss_iter += loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # report the accumulated loss every 50 iterations
            if (i + 1) % 50 == 0:
                print(
                    'Epoch:{}|{},Iteration:[{}|{}],Learning Rate:{},Loss:{:.4f}'
                    .format(e + 1, max_epoch, i + 1, len(data_loaders_train),
                            optimizer.param_groups[0]['lr'], loss_iter))
                loss_iter = 0.0
        # update learn rate
        adjust_learning_rate(optimizer, base_lr, e + 1)
        # save checkpoint
        filename = '../../work_dirs/autoencoder/epoch_{}.pth'.format(e + 1)
        save_checkpoint(net, filename=filename)
        # ---- evaluation phase: dump reconstructed images every epoch ----
        # NOTE(review): this visualizes the *last training batch* of the
        # epoch, not data from the test loader — confirm intended.
        if (e + 1) % 1 == 0:
            output_rgb = decode_rgb.cpu().data
            target_rgb = data_batch['img_rgb_out']
            output_thermal = decode_thermal.cpu().data
            tartget_thermal = data_batch['img_thermal_out']
            # NHWC uint8 buffers for the denormalized images
            pic_rgb = np.zeros((output_rgb.shape[0], output_rgb.shape[2],
                                output_rgb.shape[3], output_rgb.shape[1]),
                               dtype=np.uint8)
            target_pic_rgb = np.zeros(
                (output_rgb.shape[0], output_rgb.shape[2],
                 output_rgb.shape[3], output_rgb.shape[1]),
                dtype=np.uint8)
            pic_thermal = np.zeros((output_rgb.shape[0], output_rgb.shape[2],
                                    output_rgb.shape[3], output_rgb.shape[1]),
                                   dtype=np.uint8)
            target_pic_thermal = np.zeros(
                (output_rgb.shape[0], output_rgb.shape[2],
                 output_rgb.shape[3], output_rgb.shape[1]),
                dtype=np.uint8)
            mean_rgb = np.array(img_norm_cfg['mean'], dtype=np.float32)
            std_rgb = np.array(img_norm_cfg['std'], dtype=np.float32)
            mean_thermal = np.array(img_norm_cfg_t['mean'], dtype=np.float32)
            std_thermal = np.array(img_norm_cfg_t['std'], dtype=np.float32)
            for idx in range(output_rgb.shape[0]):
                # denormalize RGB reconstruction and target back to uint8
                img = output_rgb[idx, ...].numpy().transpose(
                    1, 2, 0).astype(np.float32)
                pic_rgb[idx, :, :, :] = mmcv.imdenormalize(
                    img, mean=mean_rgb, std=std_rgb,
                    to_bgr=False).astype(np.uint8)
                target_img = target_rgb[idx, ...].numpy().transpose(
                    1, 2, 0).astype(np.float32)
                target_pic_rgb[idx, :, :, :] = mmcv.imdenormalize(
                    target_img, mean=mean_rgb, std=std_rgb,
                    to_bgr=False).astype(np.uint8)
                # same for thermal
                img_t = output_thermal[idx, ...].numpy().transpose(
                    1, 2, 0).astype(np.float32)
                pic_thermal[idx, :, :, :] = mmcv.imdenormalize(
                    img_t, mean=mean_thermal, std=std_thermal,
                    to_bgr=False).astype(np.uint8)
                target_img_t = tartget_thermal[idx, ...].numpy().transpose(
                    1, 2, 0).astype(np.float32)
                target_pic_thermal[idx, :, :, :] = mmcv.imdenormalize(
                    target_img_t, mean=mean_thermal, std=std_thermal,
                    to_bgr=False).astype(np.uint8)
            if not os.path.exists('../../work_dirs/autoencoder'):
                os.mkdir('../../work_dirs/autoencoder')
            save_images(
                torch.from_numpy(pic_rgb.transpose((0, 3, 1, 2))),
                '../../work_dirs/autoencoder/image_rgb_{}.png'.format(e + 1))
            save_images(
                torch.from_numpy(target_pic_rgb.transpose(0, 3, 1, 2)),
                '../../work_dirs/autoencoder/target_image_rgb_{}.png'.format(
                    e + 1))
            save_images(
                torch.from_numpy(pic_thermal.transpose((0, 3, 1, 2))),
                '../../work_dirs/autoencoder/image_thermal_{}.png'.format(e + 1))
            save_images(
                torch.from_numpy(target_pic_thermal.transpose(0, 3, 1, 2)),
                '../../work_dirs/autoencoder/target_image_thermal_{}.png'.
                format(e + 1))
def main():
    """Training driver: build data/model/optimizer from the config file,
    then train, checkpoint, and evaluate sensitivity every epoch."""
    args = parse_args()
    cfg = Config.fromfile(args.cfg)
    work_dir = cfg.work_dir
    # restrict visible GPUs to those listed in the config
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
        str(device_id) for device_id in cfg.device_ids)

    log_dir = os.path.join(work_dir, 'logs')
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    logger = init_logger(log_dir)
    seed = cfg.seed
    logger.info('Set random seed to {}'.format(seed))
    set_random_seed(seed)

    # data loaders (validation always runs one image per GPU, unshuffled)
    train_dataset = get_dataset(cfg.data.train)
    train_data_loader = build_dataloader(
        train_dataset,
        cfg.data.imgs_per_gpu,
        cfg.data.workers_per_gpu,
        len(cfg.device_ids),
        dist=False,
    )
    val_dataset = get_dataset(cfg.data.val)
    val_data_loader = build_dataloader(val_dataset,
                                       1,
                                       cfg.data.workers_per_gpu,
                                       1,
                                       dist=False,
                                       shuffle=False)

    model = build_detector(cfg.model,
                           train_cfg=cfg.train_cfg,
                           test_cfg=cfg.test_cfg)
    model = MMDataParallel(model).cuda()
    optimizer = obj_from_dict(cfg.optimizer, torch.optim,
                              dict(params=model.parameters()))
    lr_scheduler = obj_from_dict(cfg.lr_scedule, LRschedule,
                                 dict(optimizer=optimizer))

    checkpoint_dir = os.path.join(cfg.work_dir, 'checkpoint_dir')
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    start_epoch = cfg.start_epoch
    if cfg.resume_from:
        checkpoint = load_checkpoint(model, cfg.resume_from)
        # NOTE(review): start_epoch is reset to 0 here instead of being
        # taken from the checkpoint meta, so the log line below always
        # reports epoch 0 — confirm this is intended.
        start_epoch = 0
        logger.info('resumed epoch {}, from {}'.format(start_epoch,
                                                       cfg.resume_from))
    log_buffer = LogBuffer()
    for epoch in range(start_epoch, cfg.end_epoch):
        train(train_data_loader, model, optimizer, epoch, lr_scheduler,
              log_buffer, cfg, logger)
        # save to a temporary file first; renamed after validation below
        tmp_checkpoint_file = os.path.join(checkpoint_dir, 'tmp_val.pth')
        meta_dict = cfg._cfg_dict
        logger.info('save tmp checkpoint to {}'.format(tmp_checkpoint_file))
        save_checkpoint(model, tmp_checkpoint_file, optimizer, meta=meta_dict)
        if len(cfg.device_ids) == 1:
            # single GPU: validate in-process
            sensitivity = val(val_data_loader, model, cfg, logger, epoch)
        else:
            # multi GPU: worker processes re-load the tmp checkpoint
            model_args = cfg.model.copy()
            model_args.update(train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)
            model_type = getattr(detectors, model_args.pop('type'))
            results = parallel_test(
                cfg,
                model_type,
                model_args,
                tmp_checkpoint_file,
                val_dataset,
                np.arange(len(cfg.device_ids)).tolist(),
                workers_per_gpu=1,
            )
            sensitivity = evaluate_deep_lesion(results, val_dataset,
                                               cfg.cfg_3dce, logger)
        # rename the tmp file to encode epoch / sensitivity / timestamp
        save_file = os.path.join(
            checkpoint_dir,
            'epoch_{}_sens@4FP_{:.5f}_{}.pth'.format(
                epoch + 1, sensitivity,
                time.strftime('%m-%d-%H-%M', time.localtime(time.time()))))
        os.rename(tmp_checkpoint_file, save_file)
        logger.info('save checkpoint to {}'.format(save_file))
        # NOTE(review): checkpoints for epochs beyond the LR schedule's
        # T_max are deleted right after being written — confirm intended.
        if epoch > cfg.lr_scedule.T_max:
            os.remove(save_file)
def test_checkpoint_loader():
    """CheckpointLoader must pick the right backend per URI scheme and
    support registering (and force-overriding) custom schemes."""
    from mmcv.runner import _load_checkpoint, save_checkpoint, CheckpointLoader
    import tempfile
    import os
    checkpoint_path = os.path.join(tempfile.gettempdir(), 'checkpoint.pth')
    model = Model()
    save_checkpoint(model, checkpoint_path)
    checkpoint = _load_checkpoint(checkpoint_path)
    assert 'meta' in checkpoint and 'CLASSES' not in checkpoint['meta']
    # remove the temp file
    os.remove(checkpoint_path)

    # each filename prefix must resolve to its dedicated loader; note that
    # 'ss3://' and ' s3://' do NOT match 's3://' and fall back to local
    filenames = [
        'http://xx.xx/xx.pth', 'https://xx.xx/xx.pth',
        'modelzoo://xx.xx/xx.pth', 'torchvision://xx.xx/xx.pth',
        'open-mmlab://xx.xx/xx.pth', 'openmmlab://xx.xx/xx.pth',
        'mmcls://xx.xx/xx.pth', 'pavi://xx.xx/xx.pth', 's3://xx.xx/xx.pth',
        'ss3://xx.xx/xx.pth', ' s3://xx.xx/xx.pth'
    ]
    fn_names = [
        'load_from_http', 'load_from_http', 'load_from_torchvision',
        'load_from_torchvision', 'load_from_openmmlab', 'load_from_openmmlab',
        'load_from_mmcls', 'load_from_pavi', 'load_from_ceph',
        'load_from_local', 'load_from_local'
    ]
    for filename, fn_name in zip(filenames, fn_names):
        loader = CheckpointLoader._get_checkpoint_loader(filename)
        assert loader.__name__ == fn_name

    @CheckpointLoader.register_scheme(prefixes='ftp://')
    def load_from_ftp(filename, map_location):
        return dict(filename=filename)

    # test register_loader
    filename = 'ftp://xx.xx/xx.pth'
    loader = CheckpointLoader._get_checkpoint_loader(filename)
    assert loader.__name__ == 'load_from_ftp'

    def load_from_ftp1(filename, map_location):
        return dict(filename=filename)

    # test duplicate registered error
    with pytest.raises(KeyError):
        CheckpointLoader.register_scheme('ftp://', load_from_ftp1)

    # test force param: overriding an existing scheme is allowed
    CheckpointLoader.register_scheme('ftp://', load_from_ftp1, force=True)
    checkpoint = CheckpointLoader.load_checkpoint(filename)
    assert checkpoint['filename'] == filename

    # test print function name
    loader = CheckpointLoader._get_checkpoint_loader(filename)
    assert loader.__name__ == 'load_from_ftp1'

    # test sort: the longest matching prefix must win
    @CheckpointLoader.register_scheme(prefixes='a/b')
    def load_from_ab(filename, map_location):
        return dict(filename=filename)

    @CheckpointLoader.register_scheme(prefixes='a/b/c')
    def load_from_abc(filename, map_location):
        return dict(filename=filename)

    filename = 'a/b/c/d'
    loader = CheckpointLoader._get_checkpoint_loader(filename)
    assert loader.__name__ == 'load_from_abc'
def test_load_checkpoint_metadata():
    """Old checkpoints are migrated via `_version` metadata and a custom
    `_load_from_state_dict` that renames parameters."""
    import os
    import tempfile

    from mmcv.runner import load_checkpoint, save_checkpoint

    class ModelV1(nn.Module):
        # old layout: convs named conv1 / conv2

        def __init__(self):
            super().__init__()
            self.block = Block()
            self.conv1 = nn.Conv2d(3, 3, 1)
            self.conv2 = nn.Conv2d(3, 3, 1)
            nn.init.normal_(self.conv1.weight)
            nn.init.normal_(self.conv2.weight)

    class ModelV2(nn.Module):
        # new layout: the same convs renamed to conv0 / conv1
        _version = 2

        def __init__(self):
            super().__init__()
            self.block = Block()
            self.conv0 = nn.Conv2d(3, 3, 1)
            self.conv1 = nn.Conv2d(3, 3, 1)
            nn.init.normal_(self.conv0.weight)
            nn.init.normal_(self.conv1.weight)

        def _load_from_state_dict(self, state_dict, prefix, local_metadata,
                                  *args, **kwargs):
            """load checkpoints."""

            # Names of some parameters have been changed between versions;
            # rewrite old keys before delegating to the default loader.
            version = local_metadata.get('version', None)
            if version is None or version < 2:
                state_dict_keys = list(state_dict.keys())
                convert_map = {'conv1': 'conv0', 'conv2': 'conv1'}
                for k in state_dict_keys:
                    for ori_str, new_str in convert_map.items():
                        if k.startswith(prefix + ori_str):
                            new_key = k.replace(ori_str, new_str)
                            state_dict[new_key] = state_dict[k]
                            del state_dict[k]

            super()._load_from_state_dict(state_dict, prefix, local_metadata,
                                          *args, **kwargs)

    model_v1 = ModelV1()
    model_v1_conv0_weight = model_v1.conv1.weight.detach()
    model_v1_conv1_weight = model_v1.conv2.weight.detach()
    model_v2 = ModelV2()
    model_v2_conv0_weight = model_v2.conv0.weight.detach()
    model_v2_conv1_weight = model_v2.conv1.weight.detach()
    ckpt_v1_path = os.path.join(tempfile.gettempdir(), 'checkpoint_v1.pth')
    ckpt_v2_path = os.path.join(tempfile.gettempdir(), 'checkpoint_v2.pth')

    # Save checkpoint
    save_checkpoint(model_v1, ckpt_v1_path)
    save_checkpoint(model_v2, ckpt_v2_path)

    # test load v1 model: conv1/conv2 weights must land in conv0/conv1
    load_checkpoint(model_v2, ckpt_v1_path)
    assert torch.allclose(model_v2.conv0.weight, model_v1_conv0_weight)
    assert torch.allclose(model_v2.conv1.weight, model_v1_conv1_weight)

    # test load v2 model: loaded unchanged
    load_checkpoint(model_v2, ckpt_v2_path)
    assert torch.allclose(model_v2.conv0.weight, model_v2_conv0_weight)
    assert torch.allclose(model_v2.conv1.weight, model_v2_conv1_weight)
object. checkpoint (str, optional): Checkpoint path. If left as None, the model will not load any weights. Returns: nn.Module: The constructed detector. """ if isinstance(config, str): config = mmcv.Config.fromfile(config) elif not isinstance(config, mmcv.Config): raise TypeError('config must be a filename or Config object, ' f'but got {type(config)}') config.model.pretrained = None model = build_detector(config.model, test_cfg=config.test_cfg) if checkpoint is not None: map_loc = 'cpu' if device == 'cpu' else None checkpoint = load_checkpoint(model, checkpoint, map_location=map_loc) if 'CLASSES' in checkpoint['meta']: model.CLASSES = checkpoint['meta']['CLASSES'] else: model.CLASSES = get_classes('coco') model.eval() return model pth = 'your_checkpoint.pth' model = init_detector('configs/gfl/gfl_r18_fpn_1x_coco.py ',pth) # corresponding config file save_checkpoint(model, pth)
def after_run(self, runner):
    """Strip the distillation teacher and dump the final student weights."""
    student = runner.model.module
    # drop the teacher network so it is not serialized into the checkpoint
    student.bbox_head.teacher_model = None
    save_checkpoint(student, 'final.pth')
def train(model, baseline, training_data, validation_data, optimizer,
          scheduler, cfg, args):
    """Train with a baseline model, validating after every epoch.

    Logs metrics to stdout and the global `vis` visualizer and saves a
    per-device checkpoint whenever the validation accuracy ties or beats
    the best seen so far on that device.

    Args:
        model: network being optimized.
        baseline: auxiliary/baseline network passed to train/eval epochs.
        training_data / validation_data: iterable data loaders.
        optimizer: optimizer used by `train_epoch`.
        scheduler: LR scheduler (currently unused; see commented code).
        cfg: config providing TRAIN.EPOCHES and MODEL.SAVE_IN/NAME.
        args: unused here; kept for interface compatibility.
    """
    valid_accus = []  # validation accuracy of every epoch so far
    for epoch_i in range(cfg.TRAIN.EPOCHES):
        print('[ Epoch', epoch_i, ']')

        # ---- training phase ----
        start = time.time()
        train_loss, train_accu = train_epoch(
            model,
            baseline,
            training_data,
            optimizer,
        )
        print(' - (Training) ,loss: {loss:3.3f}, accuracy: {accu:3.3f} %, ' \
              'elapse: {elapse:3.3f} min'.format(loss=train_loss,
                                                 accu=100 * train_accu,
                                                 elapse=(time.time() - start) / 60))
        vis.log(
            "Phase:{phase},Epoch:{epoch},AP:{AP},mAP:{mAP},train_loss:{loss}".
            format(
                phase="train",
                epoch=epoch_i,
                AP=AP.value(),
                mAP=mAP.value(),
                loss=Loss_meter.value()[0],
            ))

        # ---- validation phase ----
        start = time.time()
        #if epoch_i %3 ==0:
        valid_loss, valid_accu = eval_epoch(
            model,
            baseline,
            validation_data,
        )
        vis.log(
            "Phase:{phase},Epoch:{epoch}, AP:{AP},mAP:{mAP},val_loss:{loss}".
            format(
                phase="validation",
                epoch=epoch_i,
                AP=AP.value(),
                mAP=mAP.value(),
                loss=Loss_meter.value()[0],
            ))
        #scheduler.step(valid_loss)  # update learning rate
        #if epoch_i == 4:
        #    for param_group in optimizer.param_groups:
        #        param_group['lr'] = 1e-4
        print(' - (Validation) ,loss: {loss:3.3f}, accuracy: {accu:3.3f} %, ' \
              'elapse: {elapse:3.3f} min'.format(loss=valid_loss,
                                                 accu=100 * valid_accu,
                                                 elapse=(time.time() - start) / 60))
        vis.plot_many_stack({
            'val accuracy': valid_accu,
            'train accuracy': train_accu
        })
        vis.plot_many_stack({'val loss': valid_loss, 'train loss': train_loss})

        # Deduplicated: the original repeated this block verbatim for
        # device ids 0, 1 and 2 — only the checkpoint suffix differed.
        device_id = torch.cuda.current_device()
        if device_id in (0, 1, 2):
            valid_accus += [valid_accu]  # keep the accuracy of every epoch
            if valid_accu >= max(valid_accus):
                save_checkpoint(
                    model, cfg.MODEL.SAVE_IN + cfg.MODEL.NAME +
                    '_gpu{}'.format(device_id) + '.pth')
def test_init_weights(self):
    """Weight init, v1/v3 checkpoint loading, and pos-embed resizing."""
    cfg = deepcopy(self.cfg)
    cfg['use_abs_pos_embed'] = True
    cfg['init_cfg'] = [
        dict(
            type='Kaiming',
            layer='Conv2d',
            mode='fan_in',
            nonlinearity='linear')
    ]
    model = SwinTransformer(**cfg)
    weight_before = model.patch_embed.projection.weight.clone().detach()
    # the absolute position embedding starts at zero...
    self.assertTrue(
        torch.allclose(model.absolute_pos_embed, torch.tensor(0.)))
    model.init_weights()
    # ...and both it and the projection must move away after init
    self.assertFalse(
        torch.allclose(model.patch_embed.projection.weight, weight_before))
    self.assertFalse(
        torch.allclose(model.absolute_pos_embed, torch.tensor(0.)))

    saved_pos_embed = model.absolute_pos_embed.clone().detach()
    tmpdir = tempfile.gettempdir()

    # write a v3 checkpoint (renamed local: it holds 'v3.pth')
    ckpt_v3 = os.path.join(tmpdir, 'v3.pth')
    save_checkpoint(model, ckpt_v3)

    # fabricate a v1 checkpoint: rename norm3 -> norm, add an attn_mask
    # buffer, and downgrade the version tag
    setattr(model, 'norm', model.norm3)
    setattr(model.stages[0].blocks[1].attn, 'attn_mask',
            torch.zeros(64, 49, 49))
    model._version = 1
    del model.norm3
    ckpt_v1 = os.path.join(tmpdir, 'v1.pth')
    save_checkpoint(model, ckpt_v1)

    # a fresh model must load the v1 checkpoint strictly
    cfg = deepcopy(self.cfg)
    cfg['use_abs_pos_embed'] = True
    model = SwinTransformer(**cfg)
    load_checkpoint(model, ckpt_v1, strict=True)

    # ...and the v3 checkpoint as well
    cfg = deepcopy(self.cfg)
    cfg['use_abs_pos_embed'] = True
    model = SwinTransformer(**cfg)
    load_checkpoint(model, ckpt_v3, strict=True)

    # loading into a different img_size resizes the position embedding
    cfg = deepcopy(self.cfg)
    cfg['img_size'] = 384
    cfg['use_abs_pos_embed'] = True
    model = SwinTransformer(**cfg)
    load_checkpoint(model, ckpt_v3, strict=True)
    expected = timm_resize_pos_embed(
        saved_pos_embed, model.absolute_pos_embed, num_tokens=0)
    self.assertTrue(
        torch.allclose(model.absolute_pos_embed, expected))

    os.remove(ckpt_v1)
    os.remove(ckpt_v3)
# Transplant weights from an existing detector into a model built from the
# new config, skipping the class-specific bbox heads (class count differs).
new_config_path = "/data/modules/mmdetection/configs/dcn/cascade_rcnn_dconv_7cls.py"
new_checkpoint_path = "/data/modules/mmdetection/work_dirs/cascade_rcnn_dconv_7cls/baseline.pth"

# NOTE(review): old_config_path / old_checkpoint_path are defined earlier
# in this file (outside this excerpt).
old_model = init_detector(old_config_path, old_checkpoint_path, device='cuda:0')
# old_detector = Detector(old_config_path, old_checkpoint_path, class_restrictions=None)
# old_model = old_detector.model
# new_detector = Detector(new_config_path, None, class_restrictions=None)
new_model = init_detector(new_config_path, device='cuda:0')
# new_model = new_detector.model
print(old_model)
# print(new_model)

# copy every parameter except the bbox heads
sd = OrderedDict()
for k, v in old_model.state_dict().items():
    if "bbox_head" not in k:  # dont copy last layer
        sd[k] = v
print(new_model)
new_model.load_state_dict(sd, strict=False)
save_checkpoint(new_model, new_checkpoint_path)

# smoke-test inference on a blank image
# res = new_detector.detect_objects(np.zeros((400, 800, 3)))
res = list(inference_detector(new_model, [np.zeros((400, 800, 3))]))
print(res)
print(len(res[0]))