def __init__(self, cfg):
    super().__init__()
    self.cfg = cfg
    # build the base model components
    from utils.prepare_training import get_model
    self.backbone = get_model(cfg.backbone)
    if cfg.neck:  # cannot use "is not None": an empty neck config is {} rather than None, so rely on truthiness
        self.neck = get_model(cfg.neck)
    if cfg.head:
        self.cls_head = get_model(cfg.head)
    # initialize weights: moving them to cpu/gpu is done later by model.to()
    self.init_weights()
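# Minimal sketch (assumption, not from the source): a plausible forward() matching the
# __init__ above, where the backbone extracts features, the optional neck refines them,
# and cls_head produces class scores. Only the attribute names backbone/neck/cls_head
# come from the original code; the rest is illustrative.
def forward(self, img):
    x = self.backbone(img)                      # feature extraction
    if getattr(self, 'neck', None) is not None:
        x = self.neck(x)                        # neck is only built when cfg.neck is non-empty
    out = self.cls_head(x)                      # per-class scores
    return out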
def eval_dataset_cls(cfg_path, device=None):
    """Evaluate a dataset for a classification problem: equivalent to load_from + val in the
    runner, but usable for standalone dataset validation without the runner.
    """
    # prepare the objects needed for evaluation
    cfg = get_config(cfg_path)
    dataset = get_dataset(cfg.valset, cfg.transform_val)
    dataloader = get_dataloader(dataset, cfg.valloader)
    model = get_model(cfg)
    if device is None:
        device = torch.device(cfg.load_device)
    # TODO: check the order of the following two statements
    load_checkpoint(model, cfg.load_from, device)
    model = model.to(device)
    # start evaluation
    buffer = {'acc': []}
    n_correct = 0
    model.eval()
    for c_iter, data_batch in enumerate(dataloader):
        with torch.no_grad():  # disable backprop, forward pass only
            img = to_device(data_batch['img'], device)
            label = to_device(data_batch['gt_labels'], device)
            y_pred = model(img)
            label = torch.cat(label, dim=0)
            acc1 = accuracy(y_pred, label, topk=1)
            buffer['acc'].append(acc1)
        # accumulate the overall accuracy
        n_correct += buffer['acc'][-1] * len(data_batch['gt_labels'])

    vis_loss_acc(buffer, title='eval dataset')
    print('ACC on dataset: %.3f' % (n_correct / len(dataset)))
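# Usage sketch (assumption): validating a classifier checkpoint outside the runner.
# The cfg file name is hypothetical; cfg.load_from and cfg.load_device are taken from that cfg.
if __name__ == '__main__':
    eval_dataset_cls('cfg_classifier_resnet50.py', device=torch.device('cuda:0'))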
def onnx_exporter(cfg):
    """Convert a pytorch model into an onnx model.
    Requirements on the model:
    1. The model must implement a forward_dummy() method, for example:
        def forward_dummy(self, img):
            x = self.extract_feat(img)
            x = self.bbox_head(x)
            return x
    2. The final output of the model, i.e. the head output, must be a tuple/list/Variable,
       not a dict, which the current pytorch onnx exporter does not support.
    """
    img_shape = (1, 3) + cfg.img_size
    dummy_input = torch.randn(img_shape, device='cuda')

    # build the model from the config
    model = get_model(cfg).cuda()
    if cfg.load_from is not None:
        _ = load_checkpoint(model, cfg.load_from)
    else:
        raise ValueError('need to assign checkpoint path to load from.')
    model.forward = model.forward_dummy
    torch.onnx.export(model, dummy_input,
                      cfg.work_dir + cfg.model_name + '.onnx', verbose=True)
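# Usage sketch (assumption): exporting a detector checkpoint to ONNX. The cfg file name is
# hypothetical; the cfg must provide img_size, load_from, work_dir and model_name, since
# onnx_exporter() reads exactly those fields.
if __name__ == '__main__':
    cfg = get_config('cfg_detector_ssd300_vgg16.py')
    onnx_exporter(cfg)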
def __init__(self, cfg_path, load_from=None, load_device=None):
    self.type = 'det'  # used to tell which type of predictor this is
    # prepare the objects needed for evaluation
    self.cfg = get_config(cfg_path)
    # for convenient eval, these two settings can be overridden via arguments
    # instead of editing the cfg file every time
    if load_from is not None:
        self.cfg.load_from = load_from
    if load_device is not None:
        self.cfg.load_device = load_device
    self.model = get_model(self.cfg)
    self.device = torch.device(self.cfg.load_device)
    load_checkpoint(self.model, self.cfg.load_from, self.device)
    self.model = self.model.to(self.device)
def eval_dataset_det(cfg_path, load_from=None, load_device=None,
                     resume_from=None, result_file=None):
    """Evaluate a dataset for a detection problem: for convenient eval, a few extra
    parameters are added so the cfg settings do not need to be edited every time.
    """
    # prepare the objects needed for evaluation
    cfg = get_config(cfg_path)
    cfg.valloader.params.batch_size = 1  # force batch_size=1 during evaluation
    # for convenient eval, override these settings via arguments instead of editing the cfg
    if load_from is not None:
        cfg.load_from = load_from
    if load_device is not None:
        cfg.load_device = load_device
    if resume_from is not None:
        cfg.resume_from = resume_from

    dataset = get_dataset(cfg.valset, cfg.transform_val)
    dataloader = get_dataloader(dataset, cfg.valloader, len(cfg.gpus))
    model = get_model(cfg)
    device = torch.device(cfg.load_device)
    load_checkpoint(model, cfg.load_from, device)
    model = model.to(device)

    # no previous evaluation result: run prediction on the whole dataset
    if result_file is None:
        model.eval()
        all_bbox_cls = []
        for c_iter, data_batch in enumerate(dataloader):
            with torch.no_grad():  # disable backprop, forward pass only
                bbox_det = batch_detector(model, data_batch, device,
                                          return_loss=False)['bboxes']  # extract only the bboxes: (n_cls,)(m,5)
                # show progress
                if c_iter % 100 == 0:
                    print('%d / %d finished predict.' % (c_iter, len(dataset)))
            all_bbox_cls.append(bbox_det)  # (n_img,)(n_class,)(k,5)
        # save the prediction results to a file
        filename = get_time_str() + '_eval_result.pkl'
        save2pkl(all_bbox_cls, cfg.work_dir + filename)
    # an existing result file is available: load it instead of re-predicting
    else:
        all_bbox_cls = loadvar(result_file)
    # evaluate
    voc_eval(all_bbox_cls, dataset, iou_thr=0.5)
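# Usage sketch (assumption): two ways to call eval_dataset_det(); all paths are hypothetical.
# A first run predicts on the val set and saves a .pkl; a later run can pass result_file to
# skip prediction and only re-run voc_eval on the saved boxes.
if __name__ == '__main__':
    # predict and evaluate
    eval_dataset_det('cfg_detector_ssd300_vgg16.py',
                     load_from='work_dirs/ssd300/epoch_24.pth',
                     load_device='cuda')
    # or re-score an existing result file
    # eval_dataset_det('cfg_detector_ssd300_vgg16.py',
    #                  result_file='work_dirs/ssd300/20200101_120000_eval_result.pkl')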
def __init__(self, cfg_path, load_from=None, load_device=None):
    super().__init__()
    self.type = 'cls'
    # prepare the objects needed for evaluation
    self.cfg = get_config(cfg_path)
    # for convenient eval, these two settings can be overridden via arguments
    # instead of editing the cfg file every time
    if load_from is not None:
        self.cfg.load_from = load_from
    if load_device is not None:
        self.cfg.load_device = load_device
    self.model = get_model(self.cfg)
    self.device = torch.device(self.cfg.load_device)
    if self.cfg.load_from is not None or self.cfg.resume_from is not None:
        load_checkpoint(self.model, self.cfg.load_from, self.device)
    self.model = self.model.to(self.device)
def __init__(self, cfg, resume_from=None):
    # shared state: must be declared before resume/load, otherwise the resumed values would be overwritten
    self.c_epoch = 0
    self.c_iter = 0
    self.weight_ready = False
    self.buffer = {'loss': [], 'acc1': [], 'acc5': [], 'lr': []}
    # get the config
    self.cfg = cfg
    if resume_from is not None:
        self.cfg.resume_from = resume_from  # the runner can override resume_from directly, avoiding edits to the cfg file
    # check that the directories and files are valid
    self.check_dir_file(self.cfg)
    # set up the logger
    self.logger = get_logger(self.cfg.logger)
    self.logger.info('start logging info.')
    # set up the device: in distributed mode, different local ranks (different processes) get different devices
    self.device = get_device(self.cfg, self.logger)
    # create the batch processor
    self.batch_processor = get_batch_processor(self.cfg)
    # create the datasets
    self.trainset = get_dataset(self.cfg.trainset, self.cfg.transform)
    self.valset = get_dataset(self.cfg.valset, self.cfg.transform_val)  # validation uses only basic transforms, no augmentation
    # tmp1 = self.trainset[91]  # for debug: inspect dataset __getitem__
    # img = tmp1['img']
    # label = tmp1['gt_labels']
    # bbox = tmp1['gt_bboxes']
    # ldmk = tmp1['gt_landmarks']
    # from utils.transform import transform_inv
    # class_names = self.trainset.CLASSES
    # label = label - 1  # restore labels to start from 0 so they match CLASSES
    # transform_inv(img, bbox, label, ldmk, mean=self.cfg.transform.img_params.mean,
    #               std=self.cfg.transform.img_params.std, class_names=class_names, show=True)
    # create the dataloaders
    self.dataloader = get_dataloader(self.trainset, self.cfg.trainloader,
                                     len(self.cfg.gpus), dist=self.cfg.distribute)
    self.valloader = get_dataloader(self.valset, self.cfg.valloader,
                                    len(self.cfg.gpus), dist=self.cfg.distribute)
    # tmp2 = next(iter(self.dataloader))  # for debug: set workers=0 to step into collate_fn
    # create the model and initialize it
    if self.cfg.load_from is not None or self.cfg.resume_from is not None:
        self.cfg.backbone.params.pretrained = None  # when load_from or resume_from is set, skip loading pretrained weights
    self.model = get_model(self.cfg)
    # optimizer: must be created before the model is moved to cuda
    self.optimizer = get_optimizer(self.cfg.optimizer, self.model)
    # learning rate scheduler
    self.lr_processor = get_lr_processor(self, self.cfg.lr_processor)
    # move to GPU
    # wrap the parallel model only after the optimizer has extracted the parameters; otherwise
    # extraction may fail, since the parallel wrapper adds a module layer around the model
    self.model = get_model_wrapper(self.model, self.cfg)
    self.model.to(self.device)
    # note: resume/load restores weights directly onto the target device, so it must happen
    # after the model has been moved to that device to keep devices consistent
    # load model weights and training state, continuing training from where it stopped
    if self.cfg.resume_from:
        self.resume_training(checkpoint_path=self.cfg.resume_from,
                             map_location=self.device)  # reuse the configured device
    # load model weights only, without training state, typically used for prediction
    elif self.cfg.load_from:
        load_device = torch.device(self.cfg.load_device)
        self._load_checkpoint(checkpoint_path=self.cfg.load_from,
                              map_location=load_device)
        self.weight_ready = True
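# Usage sketch (assumption): a typical way to drive the runner above. The cfg path, the class
# name Runner, and the train()/evaluate() methods are illustrative; only __init__(cfg,
# resume_from) appears in this snippet.
if __name__ == '__main__':
    cfg = get_config('cfg_classifier_resnet50.py')
    runner = Runner(cfg, resume_from=None)
    # runner.train()      # hypothetical: start or continue training
    # runner.evaluate()   # hypothetical: validate on valset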