def save_best(self, model, optimizer=None, scheduler=None, remove_old=True, infix='epoch'):
    """Write a "best" checkpoint when the current accuracy is not below the best.

    Args:
        model: object exposing ``state_dict()``; stored under the 'model' key.
        optimizer: optional object exposing ``state_dict()``; stored under
            'optimizer' when given.
        scheduler: object whose ``info`` dict holds 'cur_acc', 'cur_epoch',
            'best_acc' and 'best_epoch', and whose ``state_dict()`` is stored
            under 'scheduler'. Although it defaults to None, the accuracy
            bookkeeping lives on it, so it is required.
        remove_old: when True, delete the previous best checkpoint file.
        infix: text placed between 'model_' and the epoch in the file name.

    Returns:
        False when the current accuracy is lower than the best one (nothing
        is written), True once the checkpoint has been saved.

    Raises:
        ValueError: if ``scheduler`` is None.
    """
    if scheduler is None:
        # Previously this surfaced as an opaque AttributeError on ``None.info``.
        raise ValueError('save_best() requires a scheduler that carries accuracy info.')

    info = scheduler.info
    if info['cur_acc'] < info['best_acc']:
        return False

    name_fmt = 'model_{}{}-{:4.2f}.pth'
    old_path = os.path.join(
        self.ckpt, name_fmt.format(infix, info['best_epoch'], info['best_acc']))
    new_name = name_fmt.format(infix, info['cur_epoch'], info['cur_acc'])
    new_path = os.path.join(self.ckpt, new_name)

    # Promote the current result first so the serialized scheduler state
    # already records it as the best.
    info['best_acc'] = info['cur_acc']
    info['best_epoch'] = info['cur_epoch']

    save_dict = {'model': model.state_dict()}
    if optimizer is not None:
        save_dict['optimizer'] = optimizer.state_dict()
    save_dict['scheduler'] = scheduler.state_dict()

    torch.save(save_dict, new_path)
    shutil.copyfile(new_path, os.path.join(self.ckpt, 'model_latest.pth'))

    # Drop the superseded file only after the new one is on disk, so a failed
    # save never leaves the directory without a best checkpoint.
    if remove_old and old_path != new_path and os.path.exists(old_path):
        os.remove(old_path)

    logging_rank('Saving best checkpoint done: {}.'.format(new_name),
                 local_rank=self.local_rank)
    return True
def evaluation(dataset, all_boxes, all_segms, all_hiers):
    """Evaluate collected predictions per IoU type and save the summary.

    'bbox' is always evaluated; 'segm' is added when ``cfg.MODEL.MASK_ON`` is
    set and 'hier' when both ``cfg.MODEL.HIER_ON`` and ``cfg.HRCNN.EVAL_HIER``
    are set.

    Args:
        dataset: object exposing a ``coco`` attribute handed to the evaluator.
        all_boxes: predictions stored under the 'bbox' key.
        all_segms: predictions stored under the 'segm' key.
        all_hiers: predictions stored under the 'hier' key.

    Returns:
        Tuple ``(results, coco_results)``: the ``COCOResults`` accumulator and
        the dict mapping each evaluated IoU type to its predictions.
    """
    output_folder = os.path.join(cfg.CKPT, 'test')
    expected_results = ()
    expected_results_sigma_tol = 4

    coco_results = {"bbox": all_boxes}
    iou_types = ("bbox",)
    if cfg.MODEL.MASK_ON:
        iou_types += ("segm",)
        coco_results["segm"] = all_segms
    if cfg.MODEL.HIER_ON and cfg.HRCNN.EVAL_HIER:
        iou_types += ("hier",)
        coco_results['hier'] = all_hiers

    results = COCOResults(*iou_types)
    logging_rank("Evaluating predictions", local_rank=0)
    for iou_type in iou_types:
        # The temporary file only supplies a fallback path; whenever
        # output_folder is non-empty the JSON path inside it is used instead.
        with tempfile.NamedTemporaryFile() as tmp:
            if output_folder:
                file_path = os.path.join(output_folder, iou_type + ".json")
            else:
                file_path = tmp.name
            res = evaluate_predictions_on_coco(
                dataset.coco, coco_results[iou_type], file_path, iou_type
            )
            results.update(res)

    logging_rank(results, local_rank=0)
    check_expected_results(results, expected_results, expected_results_sigma_tol)
    if output_folder:
        torch.save(results, os.path.join(output_folder, "coco_results.pth"))
    return results, coco_results
def test_net(args, ind_range=None):
    """Run inference over a whole dataset, or over an index range of it.

    Args:
        args: parsed arguments; only ``args.cfg_file`` is read here.
        ind_range: optional ``(start, end)`` pair restricting inference to
            that index range; None means the full dataset.

    Returns:
        A one-element tuple ``(all_boxes,)``.
    """
    dataset = build_dataset(cfg.TEST.DATASETS, is_train=False)

    cfg_name = args.cfg_file.split('/')[-1]
    log_period = int(np.ceil(10 / cfg.TEST.IMS_PER_GPU))
    all_hooks = build_test_hooks(cfg_name, log_period=log_period)

    if ind_range is None:
        start_ind, end_ind = 0, len(dataset)
    else:
        start_ind, end_ind = ind_range

    model = initialize_model_from_cfg()
    all_boxes = test(model, dataset, start_ind, end_ind, all_hooks)

    # Ranged runs get a range-specific file name so that several of them can
    # write into the same folder.
    if ind_range is None:
        det_name = 'detections.pkl'
    else:
        det_name = 'detection_range_%s_%s.pkl' % tuple(ind_range)
    det_file = os.path.join(cfg.CKPT, 'test', det_name)

    save_object(dict(all_boxes=all_boxes, ), det_file)
    logging_rank('Wrote detections to: {}'.format(os.path.abspath(det_file)), local_rank=0)
    return all_boxes,
def multi_gpu_test_net_on_dataset(args, num_images):
    """Run inference in parallel subprocesses and merge their results.

    Args:
        args: parsed arguments; ``args.test_net_file`` names the script
            (without '.py') in the current working directory to launch.
        num_images: total number of items to split across subprocesses.

    Returns:
        Tuple ``(all_boxes, all_segms, all_hiers)`` concatenated over all
        subprocess outputs, in the order the outputs are returned.
    """
    binary = os.path.join(os.getcwd(), args.test_net_file + '.py')
    assert os.path.exists(binary), 'Binary \'{}\' not found'.format(binary)

    # Each element of `outputs` is the dict one subprocess saved.
    outputs = subprocess_utils.process_in_parallel('detection', num_images, binary, cfg, cfg.CKPT)

    # Collate the per-subprocess lists into single lists.
    all_boxes, all_segms, all_hiers = [], [], []
    for part in outputs:
        all_boxes += part['all_boxes']
        all_segms += part['all_segms']
        all_hiers += part['all_hiers']

    det_file = os.path.join(cfg.CKPT, 'test', 'detections.pkl')
    merged = dict(
        all_boxes=all_boxes,
        all_segms=all_segms,
        all_hiers=all_hiers,
    )
    save_object(merged, det_file)
    logging_rank('Wrote detections to: {}'.format(os.path.abspath(det_file)), local_rank=0)
    return all_boxes, all_segms, all_hiers
def align_and_update_state_dicts(model_state_dict, weights_dict, use_weights_once=False, local_rank=0):
    """Copy loaded weights into a model state dict by suffix-matching key names.

    Adapted from the maskrcnn_benchmark repo:
    https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/utils/model_serialization.py

    Strategy: model keys may carry prefixes that the loaded weights lack (for
    example ``backbone[0].body.res2.conv1.weight`` in the model versus
    ``res2.conv1.weight`` in the file). For every model key, each loaded key
    that is a suffix of it is a candidate, and the longest candidate wins. So
    if the file contains both ``res2.conv1.weight`` and ``conv1.weight``,
    ``backbone[0].body.conv1.weight`` matches ``conv1.weight`` while
    ``backbone[0].body.res2.conv1.weight`` matches ``res2.conv1.weight``.

    Args:
        model_state_dict: mapping of model keys to values; updated in place.
        weights_dict: mapping of loaded keys to values. Matched values must
            expose ``.shape`` (used for logging).
        use_weights_once: when True, each loaded key is assigned to at most
            one model key (selected via ``np.unique(..., return_index=True)``).
        local_rank: forwarded to ``logging_rank`` for the per-key log lines.

    Returns:
        Tuple ``(model_state_dict, mismatch_keys)`` where ``mismatch_keys`` is
        the set of model keys that received no loaded weight.
    """
    model_keys = sorted(model_state_dict.keys())
    weights_keys = sorted(weights_dict.keys())

    # With nothing to match against, the row-wise max below would reduce over
    # a zero-size dimension and raise; report every model key as unmatched.
    if not model_keys or not weights_keys:
        return model_state_dict, set(model_keys)

    # Entry (i, j) is len(weights_keys[j]) when it is a suffix of
    # model_keys[i], else 0, so the row-wise max selects the longest suffix.
    match_matrix = [
        len(j) if i.endswith(j) else 0 for i in model_keys for j in weights_keys
    ]
    match_matrix = torch.as_tensor(match_matrix).view(len(model_keys), len(weights_keys))
    max_match_size, idxs = match_matrix.max(1)
    # remove indices that correspond to no-match
    idxs[max_match_size == 0] = -1

    # column widths, used for logging only
    max_size_model = max(len(key) for key in model_keys)
    max_size_weights = max(len(key) for key in weights_keys)

    match_keys = set()
    if use_weights_once:
        # np.unique returns (unique weight indices, index of first occurrence);
        # reversing gives (model index, weights index) pairs.
        idx_model_and_weights = zip(
            *np.unique(idxs.numpy(), return_index=True)[::-1])
    else:
        idx_model_and_weights = enumerate(idxs.tolist())

    for idx_model, idx_weights in idx_model_and_weights:
        if idx_weights == -1:
            continue
        key_model = model_keys[idx_model]
        key_weights = weights_keys[idx_weights]
        model_state_dict[key_model] = weights_dict[key_weights]
        match_keys.add(key_model)
        logging_rank('{: <{}} loaded from {: <{}} of shape {}'.format(
            key_model, max_size_model, key_weights, max_size_weights,
            tuple(weights_dict[key_weights].shape)), local_rank=local_rank)

    mismatch_keys = set(model_keys) - match_keys
    return model_state_dict, mismatch_keys
def load_optimizer(self, optimizer):
    """Restore optimizer state from the checkpoint when resuming.

    When ``self.resume`` is set, the 'optimizer' entry is popped from
    ``self.checkpoint`` and loaded into ``optimizer``; otherwise the
    optimizer is left untouched.

    Returns:
        The same ``optimizer`` object.
    """
    if not self.resume:
        logging_rank('Initializing optimizer done.', local_rank=self.local_rank)
        return optimizer

    saved_state = self.checkpoint.pop('optimizer')
    optimizer.load_state_dict(saved_state)
    logging_rank('Loading optimizer done.', local_rank=self.local_rank)
    return optimizer
def convert_conv1_rgb2bgr(self, weights_dict):
    """Swap input channels 0 and 2 of ``weights_dict['conv1.weight']``.

    Supports caffe trained models: include resnet50/101/152 and vgg16.

    The entry is replaced by a new tensor built from a NumPy copy, so the
    original tensor's storage is not modified.

    Returns:
        The same ``weights_dict`` with the converted 'conv1.weight'.
    """
    kernel = weights_dict['conv1.weight'].cpu().numpy().copy()
    # Fancy indexing yields a copy, so this is a true swap of axis-1 slices.
    swapped = kernel[:, [2, 0], :, :]
    kernel[:, [0, 2], :, :] = swapped
    weights_dict['conv1.weight'] = torch.from_numpy(kernel)
    logging_rank('Convert conv1.weight channel: {}'.format(kernel.shape),
                 local_rank=self.local_rank)
    return weights_dict
def process_in_parallel(tag, total_range_size, binary, cfg, ckpt_path):
    """Run ``binary`` once per visible GPU, each as a subprocess on one range.

    The index range ``[0, total_range_size)`` is split across the GPUs listed
    in ``CUDA_VISIBLE_DEVICES``. Each subprocess is started with
    ``--range {start} {end} --cfg {cfg_file} --gpu_id {gpu_id}`` and is
    expected to write ``<ckpt_path>/test/<tag>_range_<start>_<end>.pkl``.

    Args:
        tag: prefix used in the config, stdout and result file names.
        total_range_size: number of items to distribute.
        binary: path of the Python script to launch.
        cfg: configuration object dumped to YAML for the subprocesses.
        ckpt_path: directory whose 'test' sub-folder holds all produced files.

    Returns:
        List with the unpickled result of every launched subprocess, in
        launch order.

    Raises:
        NotImplementedError: if ``CUDA_VISIBLE_DEVICES`` is unset or empty.
    """
    test_dir = os.path.join(ckpt_path, 'test')

    # Dump the cfg so every subprocess can load it via --cfg.
    cfg_file = os.path.join(test_dir, '{}_range_config.yaml'.format(tag))
    with open(cfg_file, 'w') as f:
        yaml.dump(cfg, stream=f)
    subprocess_env = os.environ.copy()
    processes = []

    # Determine GPUs to use
    cuda_visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES')
    if cuda_visible_devices:
        gpu_inds = list(map(int, cuda_visible_devices.split(',')))
        assert -1 not in gpu_inds, 'Hiding GPU indices using the \'-1\' index is not supported'
    else:
        raise NotImplementedError
    subinds = np.array_split(range(total_range_size), len(gpu_inds))

    # Run the binary in one subprocess per GPU
    for i, gpu_ind in enumerate(gpu_inds):
        if len(subinds[i]) == 0:
            # More GPUs than items: array_split leaves the trailing chunks
            # empty, and there is nothing to run for them.
            continue
        start = subinds[i][0]
        end = subinds[i][-1] + 1
        subprocess_env['CUDA_VISIBLE_DEVICES'] = str(gpu_ind)
        cmd = ('python {binary} --range {start} {end} --cfg {cfg_file} --gpu_id {gpu_id}')
        cmd = cmd.format(
            binary=shlex_quote(binary),
            start=int(start),
            end=int(end),
            cfg_file=shlex_quote(cfg_file),
            gpu_id=str(gpu_ind),
        )
        logging_rank('{} range command {}: {}'.format(tag, i, cmd))
        if i == 0:
            # The first subprocess is piped so its output can be streamed live.
            subprocess_stdout = subprocess.PIPE
        else:
            filename = os.path.join(test_dir, '%s_range_%s_%s.stdout' % (tag, start, end))
            subprocess_stdout = open(filename, 'w')
        p = subprocess.Popen(
            cmd,
            shell=True,
            env=subprocess_env,
            stdout=subprocess_stdout,
            stderr=subprocess.STDOUT,
            bufsize=1
        )
        processes.append((i, p, start, end, subprocess_stdout))

    # Log output from inference processes and collate their results
    outputs = []
    for i, p, start, end, subprocess_stdout in processes:
        log_subprocess_output(i, p, ckpt_path, tag, start, end)
        if isinstance(subprocess_stdout, IOBase):
            subprocess_stdout.close()
        range_file = os.path.join(test_dir, '%s_range_%s_%s.pkl' % (tag, start, end))
        # NOTE: pickle executes arbitrary code on load; this is only acceptable
        # because the file is written by the subprocess launched above.
        with open(range_file, 'rb') as f:
            range_data = pickle.load(f)
        outputs.append(range_data)
    return outputs
def load_scheduler(self, scheduler):
    """Restore the scheduler's iteration counter and info when resuming.

    When ``self.resume`` is set, ``scheduler.iteration`` and
    ``scheduler.info`` are overwritten with the values stored under
    ``self.checkpoint['scheduler']``; otherwise nothing is changed.

    Returns:
        The same ``scheduler`` object.
    """
    if not self.resume:
        logging_rank('Initializing scheduler done.', local_rank=self.local_rank)
        return scheduler

    saved = self.checkpoint['scheduler']
    scheduler.iteration = saved['iteration']
    scheduler.info = saved['info']
    logging_rank('Loading scheduler done.', local_rank=self.local_rank)
    return scheduler
def parsing_iou(predict_root, im_dir, num_parsing):
    """Compute IoU and accuracy metrics over the PNG predictions in a folder.

    Args:
        predict_root: directory searched (non-recursively) for ``*.png`` files.
        im_dir: forwarded to ``compute_hist``.
        num_parsing: forwarded to ``compute_hist``.

    Returns:
        Tuple ``(iou, miou, mean_acc, pixel_acc)`` derived from the histogram
        returned by ``compute_hist``.
    """
    predict_list = glob.glob(predict_root + '/*.png')
    logging_rank('The predict size: {}'.format(len(predict_list)))

    hist = compute_hist(predict_list, im_dir, num_parsing)
    per_class_iou, mean_iou = mean_IoU(hist)
    return per_class_iou, mean_iou, per_class_acc(hist), pixel_wise_acc(hist)
def get_params_list(self):
    """Sort the model's trainable parameters into three lists.

    Each parameter with ``requires_grad`` set is appended to exactly one of:
    ``self.bias_params_list`` (name contains 'bias'),
    ``self.gn_params_list`` (name is in ``self.gn_param_nameset``), or
    ``self.nonbias_params_list`` (everything else). The 'bias' test takes
    precedence over the name-set test. Frozen parameters are only logged.
    """
    for name, param in self.model.named_parameters():
        if not param.requires_grad:
            logging_rank('{} does not need grad.'.format(name), local_rank=self.local_rank)
            continue

        if 'bias' in name:
            bucket = self.bias_params_list
        elif name in self.gn_param_nameset:
            bucket = self.gn_params_list
        else:
            bucket = self.nonbias_params_list
        bucket.append(param)
def main():
    """Build every training component from ``cfg`` / ``args`` and start training.

    Steps, in order: prepare the checkpoint directory, create the model, load
    weights through the checkpointer, apply the configured batch-norm
    conversion, build optimizer and scheduler (restoring both from the
    checkpoint when resuming), build the data loader, wrap the model for
    (distributed) data parallelism, build hooks and call ``train``.
    """
    # Checkpoint directory, plus a copy of the config file next to the weights.
    if not os.path.isdir(cfg.CKPT):
        mkdir_p(cfg.CKPT)
    if args.cfg_file is not None:
        cfg_copy = os.path.join(cfg.CKPT, args.cfg_file.split('/')[-1])
        shutil.copyfile(args.cfg_file, cfg_copy)
    assert_and_infer_cfg(make_immutable=False)

    # Model
    model = Generalized_RCNN()
    logging_rank(model, distributed=args.distributed, local_rank=args.local_rank)

    # Checkpointer, then weights (or random initialization)
    checkpointer = CheckPointer(
        cfg.CKPT,
        weights_path=cfg.TRAIN.WEIGHTS,
        auto_resume=cfg.TRAIN.AUTO_RESUME,
        local_rank=args.local_rank,
    )
    model = checkpointer.load_model(model, convert_conv1=cfg.MODEL.CONV1_RGB2BGR)

    bn_mode = cfg.MODEL.BATCH_NORM
    if bn_mode == 'freeze':
        model = convert_bn2affine_model(model, merge=not checkpointer.resume)
    elif bn_mode == 'sync':
        model = convert_bn2syncbn_model(model)
    model.to(args.device)

    # Optimizer
    optimizer = Optimizer(model, cfg.SOLVER, local_rank=args.local_rank).build()
    optimizer = checkpointer.load_optimizer(optimizer)
    mismatch_report = mismatch_params_filter(sorted(checkpointer.mismatch_keys))
    logging_rank('The mismatch keys: {}'.format(mismatch_report),
                 distributed=args.distributed, local_rank=args.local_rank)

    # Scheduler
    scheduler = LearningRateScheduler(optimizer, cfg.SOLVER, start_iter=0,
                                      local_rank=args.local_rank)
    scheduler = checkpointer.load_scheduler(scheduler)

    # Training dataset and loader; the loader starts at the scheduler's iteration.
    datasets = build_dataset(cfg.TRAIN.DATASETS, is_train=True, local_rank=args.local_rank)
    train_loader = make_train_data_loader(
        datasets, is_distributed=args.distributed, start_iter=scheduler.iteration)

    # Parallel wrapper
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank,
        )
    else:
        model = torch.nn.DataParallel(model)

    # Hooks
    all_hooks = build_train_hooks(
        cfg,
        optimizer,
        scheduler,
        max_iter=cfg.SOLVER.MAX_ITER,
        warmup_iter=cfg.SOLVER.WARM_UP_ITERS,
        ignore_warmup_time=False,
    )

    # Train
    train(model, train_loader, optimizer, scheduler, checkpointer, all_hooks)
def evaluation(dataset, all_boxes, all_segms, all_parss, all_pscores, clean_up=True):
    """Evaluate predictions per IoU type, including human parsing.

    'bbox' is always evaluated; 'segm' is added when ``cfg.MODEL.MASK_ON`` is
    set and 'parsing' when ``cfg.MODEL.PARSING_ON`` is set. Parsing goes
    through ``evaluate_parsing``; the other types go through
    ``evaluate_predictions_on_coco``.

    Args:
        dataset: object exposing a ``coco`` attribute handed to the evaluator.
        all_boxes: predictions stored under the 'bbox' key.
        all_segms: predictions stored under the 'segm' key.
        all_parss: first element of the 'parsing' entry.
        all_pscores: second element of the 'parsing' entry.
        clean_up: when True, the output folder is removed at the end.

    Returns:
        Tuple ``(results, coco_results)``.
    """
    output_folder = os.path.join(cfg.CKPT, 'test')
    expected_results = ()
    expected_results_sigma_tol = 4

    coco_results = {"bbox": all_boxes}
    iou_types = ("bbox", )
    if cfg.MODEL.MASK_ON:
        iou_types += ("segm", )
        coco_results["segm"] = all_segms
    if cfg.MODEL.PARSING_ON:
        iou_types += ("parsing", )
        coco_results['parsing'] = [all_parss, all_pscores]

    results = COCOResults(*iou_types)
    logging_rank("Evaluating predictions", local_rank=0)
    for iou_type in iou_types:
        if iou_type != "parsing":
            # The temporary file only supplies a fallback path; whenever
            # output_folder is non-empty the JSON path inside it is used.
            with tempfile.NamedTemporaryFile() as tmp:
                if output_folder:
                    file_path = os.path.join(output_folder, iou_type + ".json")
                else:
                    file_path = tmp.name
                res = evaluate_predictions_on_coco(dataset.coco, coco_results[iou_type],
                                                   file_path, iou_type)
                results.update(res)
            continue

        eval_ap = cfg.PRCNN.EVAL_AP
        num_parsing = cfg.PRCNN.NUM_PARSING
        assert len(
            cfg.TEST.DATASETS) == 1, 'Parsing only support one dataset now'
        test_set = cfg.TEST.DATASETS[0]
        im_dir = dataset_catalog.get_im_dir(test_set)
        ann_fn = dataset_catalog.get_ann_fn(test_set)
        res = evaluate_parsing(coco_results[iou_type], eval_ap, cfg.PRCNN.SCORE_THRESH,
                               num_parsing, im_dir, ann_fn, output_folder)
        results.update_parsing(res)

    logging_rank(results, local_rank=0)
    check_expected_results(results, expected_results, expected_results_sigma_tol)
    if output_folder:
        torch.save(results, os.path.join(output_folder, "coco_results.pth"))
    # NOTE(review): with clean_up=True this also deletes the coco_results.pth
    # written just above — confirm that is intended.
    if clean_up:
        shutil.rmtree(output_folder)
    return results, coco_results
def train(model, loader, optimizer, scheduler, checkpointer, all_hooks):
    """Run the training loop from ``scheduler.iteration`` to ``cfg.SOLVER.MAX_ITER``.

    Every hook's ``before_train`` / ``before_step`` / ``after_step`` /
    ``after_train`` is invoked around the loop; ``after_train`` runs even if
    the loop raises. On local rank 0 a checkpoint is saved every
    ``cfg.SOLVER.SNAPSHOT_ITERS`` iterations (when positive) and once more
    after the last iteration.

    Args:
        model: callable as ``model(images, targets)`` returning a dict whose
            'losses' entry maps names to loss values.
        loader: iterable yielding ``(images, targets, _)`` triples.
        optimizer: optimizer stepped once per iteration.
        scheduler: provides the starting iteration and is saved in checkpoints.
        checkpointer: object whose ``save`` writes checkpoints.
        all_hooks: hook objects called around training and each step.

    Returns:
        None.
    """
    def _snapshot():
        checkpointer.save(model, optimizer, scheduler, copy_latest=True, infix='iter')

    # switch to train mode
    model.train()

    start_iter = scheduler.iteration
    batches = iter(enumerate(loader, start_iter))
    logging_rank("Starting training from iteration {}".format(start_iter),
                 distributed=args.distributed, local_rank=args.local_rank)

    with EventStorage(start_iter=start_iter, log_period=cfg.DISPLAY_ITER) as storage:
        try:
            for hook in all_hooks:
                hook.before_train()

            for iteration in range(start_iter, cfg.SOLVER.MAX_ITER):
                for hook in all_hooks:
                    hook.before_step(storage=storage)

                # Time spent fetching the batch and moving it to the device.
                tic = time.perf_counter()
                _, (images, targets, _) = next(batches)
                images = images.to(args.device)
                targets = [item.to(args.device) for item in targets]
                data_time = time.perf_counter() - tic

                optimizer.zero_grad()
                outputs = model(images, targets)
                metrics_dict = outputs['losses']
                # Sum before data_time is added to the same dict below.
                losses = sum(metrics_dict.values())
                metrics_dict["data_time"] = data_time
                write_metrics(metrics_dict, storage)

                losses.backward()
                optimizer.step()

                if args.local_rank == 0:
                    # Save model
                    snapshot_every = cfg.SOLVER.SNAPSHOT_ITERS
                    if snapshot_every > 0 and (iteration + 1) % snapshot_every == 0:
                        _snapshot()

                for hook in all_hooks:
                    hook.after_step(storage=storage)
                storage.step()

            if args.local_rank == 0:
                _snapshot()
        finally:
            for hook in all_hooks:
                hook.after_train(storage=storage)
    return None
def load_weights(model, weights_path, use_weights_once=False, local_rank=0):
    """Load weights from a file into ``model`` using suffix-based key matching.

    The file is read once onto the CPU. If the loaded object has a 'model'
    entry, that entry is used as the weights dict; otherwise the loaded
    object itself is used. A leading 'module.' prefix is stripped from the
    keys before matching.

    Args:
        model: module exposing ``state_dict()`` / ``load_state_dict()``.
        weights_path: path handed to ``torch.load``.
        use_weights_once: forwarded to ``align_and_update_state_dicts``.
        local_rank: rank used for the two summary log lines.
    """
    checkpoint = torch.load(weights_path, map_location=torch.device("cpu"))
    # Catch only the lookup failures that mean "no 'model' wrapper"; load
    # errors such as a missing file now propagate from the single read above
    # instead of being swallowed and retried.
    try:
        weights_dict = checkpoint['model']
    except (KeyError, TypeError, IndexError):
        weights_dict = checkpoint
    weights_dict = strip_prefix_if_present(weights_dict, prefix='module.')

    model_state_dict = model.state_dict()
    # NOTE(review): -1 is passed as local_rank here rather than the caller's
    # value — presumably to silence per-key logging; confirm.
    model_state_dict, mismatch_keys = align_and_update_state_dicts(
        model_state_dict, weights_dict, use_weights_once, -1)
    model.load_state_dict(model_state_dict)

    logging_rank('The mismatch keys: {}.'.format(
        list(mismatch_params_filter(sorted(mismatch_keys)))), local_rank=local_rank)
    logging_rank('Loading from weights: {}.'.format(weights_path), local_rank=local_rank)
def update_learning_rate(self):
    """Apply ``self.new_lr`` to every optimizer parameter group.

    Nothing happens when the first group's current rate already equals
    ``self.new_lr``. Otherwise each group's 'lr' becomes
    ``self.new_lr * lr_scale`` (``lr_scale`` defaults to 1 when the group
    has no such key). The change is logged when its ratio exceeds
    ``self.solver.LOG_LR_CHANGE_THRESHOLD`` and the new rate is at least 1e-7.
    """
    cur_lr = self.optimizer.param_groups[0]['lr']
    if cur_lr == self.new_lr:
        return

    ratio = _get_lr_change_ratio(cur_lr, self.new_lr)
    if ratio > self.solver.LOG_LR_CHANGE_THRESHOLD and self.new_lr >= 1e-7:
        logging_rank('Changing learning rate {:.6f} -> {:.6f}'.format(
            cur_lr, self.new_lr), local_rank=self.local_rank)

    # Different parameter groups may carry their own scale on the base rate.
    for group in self.optimizer.param_groups:
        group['lr'] = self.new_lr * group.get('lr_scale', 1)
def test_net_on_dataset(args, multi_gpu=False):
    """Run inference on the test dataset, time it, and evaluate the results.

    Args:
        args: parsed arguments forwarded to the inference function.
        multi_gpu: when True, inference is delegated to
            ``multi_gpu_test_net_on_dataset``; otherwise ``test_net`` is used.

    Returns:
        Whatever ``evaluation`` returns for the collected predictions.
    """
    dataset = build_dataset(cfg.TEST.DATASETS, is_train=False)

    total_timer = Timer()
    total_timer.tic()
    if multi_gpu:
        predictions = multi_gpu_test_net_on_dataset(args, len(dataset))
    else:
        predictions = test_net(args)
    # Exactly six result collections are expected from either path.
    all_boxes, all_segms, all_keyps, all_parss, all_pscores, all_uvs = predictions
    total_timer.toc(average=False)

    logging_rank('Total inference time: {:.3f}s'.format(total_timer.average_time), local_rank=0)
    return evaluation(dataset, all_boxes, all_segms, all_keyps, all_parss, all_pscores, all_uvs)
def log_stats(self, cur_idx, start_ind, end_ind, total_num_images, suffix=None):
    """Log progress and per-image timings every ``self.log_period`` indices.

    Args:
        cur_idx: zero-based index of the current item.
        start_ind: zero-based start of the processed range.
        end_ind: end of the processed range.
        total_num_images: total item count shown in the message.
        suffix: optional text appended to the message.

    Returns:
        None.
    """
    if cur_idx % self.log_period != 0:
        return None

    per_gpu = self.ims_per_gpu
    eta_seconds = self.iter_timer.average_time / per_gpu * (end_ind - cur_idx - 1)
    eta = str(datetime.timedelta(seconds=int(eta_seconds)))

    progress = '[Testing][range:{}-{} of {}][{}/{}]'.format(
        start_ind + 1, end_ind, total_num_images, cur_idx + 1, end_ind)
    # total = data + inference + post-processing, each divided by ims_per_gpu
    timing = '[{:.3f}s = {:.3f}s + {:.3f}s + {:.3f}s][eta: {}]'.format(
        float(self.iter_timer.average_time) / per_gpu,
        float(self.data_timer.average_time) / per_gpu,
        float(self.infer_timer.average_time) / per_gpu,
        float(self.post_timer.average_time) / per_gpu,
        eta)

    lines = progress + timing
    if suffix is not None:
        lines += suffix
    logging_rank(lines)
    return None
def build_dataset(dataset_list, is_train=True, local_rank=0):
    """Create a COCO-style dataset for each name and merge them if several.

    Args:
        dataset_list: list or tuple of dataset names.
        is_train: forwarded to ``build_transforms``; also decides whether
            images without annotations are removed.
        local_rank: rank used for the "Creating" log lines.

    Returns:
        The single dataset, or a ``D.ConcatDataset`` when more than one name
        is given.

    Raises:
        RuntimeError: if ``dataset_list`` is not a list or tuple.
    """
    if not isinstance(dataset_list, (list, tuple)):
        raise RuntimeError(
            "dataset_list should be a list of strings, got {}".format(dataset_list))

    # Validate every name before building anything.
    for name in dataset_list:
        assert contains(name), 'Unknown dataset name: {}'.format(name)
        assert os.path.exists(get_im_dir(name)), \
            'Im dir \'{}\' not found'.format(get_im_dir(name))
        logging_rank('Creating: {}'.format(name), local_rank=local_rank)

    transforms = build_transforms(is_train)

    datasets = []
    for name in dataset_list:
        root = get_im_dir(name)
        ann_file = get_ann_fn(name)
        # 'bbox' is always present; the rest follow the enabled model heads.
        optional_types = (
            ('segm', cfg.MODEL.MASK_ON),
            ('keypoints', cfg.MODEL.KEYPOINT_ON),
            ('parsing', cfg.MODEL.PARSING_ON),
            ('uv', cfg.MODEL.UV_ON),
        )
        ann_types = ('bbox', ) + tuple(kind for kind, enabled in optional_types if enabled)
        # make dataset from factory
        datasets.append(D.COCODataset(
            root=root,
            ann_file=ann_file,
            remove_images_without_annotations=is_train,
            ann_types=ann_types,
            transforms=transforms,
        ))

    # concatenate all datasets into a single one when there are several
    if len(datasets) > 1:
        return D.ConcatDataset(datasets)
    return datasets[0]
def load_model(self, model, convert_conv1=False, use_weights_once=False):
    """Initialize ``model`` from the checkpoint, pre-trained weights, or not at all.

    Three cases:
      * resuming: the 'model' entry is popped from ``self.checkpoint``;
      * ``self.weights_path`` set: ``self.checkpoint`` itself is the weights
        dict, passed through ``self.weight_mapping`` and, if
        ``convert_conv1`` is set, ``self.convert_conv1_rgb2bgr``;
      * neither: the model is returned unchanged.

    In the first two cases a 'module.' key prefix is stripped and
    ``self.mismatch_keys`` is set to the unmatched model keys.

    Returns:
        The same ``model`` object.
    """
    if not self.resume and not self.weights_path:
        logging_rank('Training from scratch.', local_rank=self.local_rank)
        return model

    if self.resume:
        weights_dict = self.checkpoint.pop('model')
        weights_dict = strip_prefix_if_present(weights_dict, prefix='module.')
        done_msg = 'Resuming from weights: {}.'
    else:
        weights_dict = strip_prefix_if_present(self.checkpoint, prefix='module.')
        # The two conversions below apply to pre-training weights only.
        weights_dict = self.weight_mapping(weights_dict)
        if convert_conv1:
            weights_dict = self.convert_conv1_rgb2bgr(weights_dict)
        done_msg = 'Pre-training on weights: {}.'

    model_state_dict = model.state_dict()
    model_state_dict, self.mismatch_keys = align_and_update_state_dicts(
        model_state_dict, weights_dict, use_weights_once, self.local_rank)
    model.load_state_dict(model_state_dict)
    logging_rank(done_msg.format(self.weights_path), local_rank=self.local_rank)
    return model
def _get_repeat_factors(self, dataset_dicts): """ Compute (fractional) per-image repeat factors. Args: dataset_dicts (list) : per-image annotations Returns: torch.Tensor: the i-th element is the repeat factor for the dataset_dicts image at index i. """ # 1. For each category c, compute the fraction of images that contain it: f(c) category_freq = defaultdict(int) for dataset_dict in dataset_dicts: # For each image (without repeats) cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]} for cat_id in cat_ids: category_freq[cat_id] += 1 num_images = len(dataset_dicts) for k, v in category_freq.items(): category_freq[k] = v / num_images # 2. For each category c, compute the category-level repeat factor: # lvis paper: r(c) = max(1, sqrt(t / f(c))) # common: r(c) = max(i, min(a,pow(t / f(c),alpha))) category_rep = { cat_id: max(self.config.MIN_REPEAT_TIMES, min(self.config.MAX_REPEAT_TIMES, math.pow( (self.config.REPEAT_THRESHOLD / cat_freq), self.config.POW))) for cat_id, cat_freq in category_freq.items() } # 3. For each image I, compute the image-level repeat factor: # r(I) = max_{c in I} r(c) rep_factors = [] for dataset_dict in dataset_dicts: cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]} rep_factor = max({category_rep[cat_id] for cat_id in cat_ids}) rep_factors.append(rep_factor) logging_rank('max(rep_factors): {} , min(rep_factors): {} , len(rep_factors): {}'. format(max(rep_factors), min(rep_factors), len(rep_factors)), distributed=1, local_rank=self.rank) return torch.tensor(rep_factors, dtype=torch.float32)
def log_subprocess_output(i, p, ckpt_path, tag, start, end):
    """Capture the output of one range subprocess and print it in the parent.

    The first subprocess (``i == 0``) is streamed line by line from its pipe
    while also being written to its ``.stdout`` file. For every other
    subprocess the function waits for it to finish and then prints the
    contents of its ``.stdout`` file.

    Args:
        i: index of the subprocess.
        p: the ``subprocess.Popen`` object.
        ckpt_path: directory whose 'test' sub-folder holds the stdout files.
        tag: prefix of the stdout file name.
        start: start of the subprocess's range (part of the file name).
        end: end of the subprocess's range (part of the file name).

    Raises:
        AssertionError: if the subprocess exits with a non-zero code.
    """
    outfile = os.path.join(ckpt_path, 'test', '%s_range_%s_%s.stdout' % (tag, start, end))
    logging_rank('# ' + '-' * 76 + ' #')
    logging_rank('stdout of subprocess %s with range [%s, %s]' % (i, start + 1, end))
    logging_rank('# ' + '-' * 76 + ' #')
    if i == 0:
        # Stream the piped stdout from the first subprocess in realtime
        with open(outfile, 'w') as f:
            for line in iter(p.stdout.readline, b''):
                # Non-ASCII bytes are escaped as \xNN rather than raising, so
                # stray child output cannot abort the run; the escaped text
                # stays pure ASCII and is writable under any locale.
                print(line.rstrip().decode('ascii', errors='backslashreplace'))
                f.write(line.decode('ascii', errors='backslashreplace'))
        p.stdout.close()
        ret = p.wait()
    else:
        # For subprocesses >= 1, wait and dump their log file
        ret = p.wait()
        with open(outfile, 'r') as f:
            print(f.read())
    assert ret == 0, 'Range subprocess failed (exit code: {})'.format(ret)
def save(self, model, optimizer=None, scheduler=None, copy_latest=True, infix='epoch'):
    """Write 'model_latest.pth' and optionally copy it under an iteration name.

    Args:
        model: object exposing ``state_dict()``; stored under 'model'.
        optimizer: optional; its ``state_dict()`` is stored under 'optimizer'.
        scheduler: optional; its ``state_dict()`` is stored under 'scheduler'
            and its ``iteration`` names the copy.
        copy_latest: when True and a scheduler is given, the latest file is
            also copied to ``model_<infix><iteration>.pth``.
        infix: text placed between 'model_' and the iteration number.
    """
    state = {'model': model.state_dict()}
    if optimizer is not None:
        state['optimizer'] = optimizer.state_dict()
    if scheduler is not None:
        state['scheduler'] = scheduler.state_dict()

    latest_path = os.path.join(self.ckpt, 'model_latest.pth')
    torch.save(state, latest_path)

    message = 'Saving checkpoint done.'
    if copy_latest and scheduler:
        copy_name = 'model_{}{}.pth'.format(infix, str(scheduler.iteration))
        shutil.copyfile(latest_path, os.path.join(self.ckpt, copy_name))
        message += ' And copy "model_latest.pth" to "model_{}{}.pth".'.format(
            infix, str(scheduler.iteration))
    logging_rank(message, local_rank=self.local_rank)
def evaluate_parsing(all_results, eval_ap, score_thresh, num_parsing, im_dir, ann_fn, output_folder):
    """Compute and print parsing metrics for predictions stored on disk.

    Predictions are read from ``<output_folder>/parsing_predict``. Per-class
    IoU, mean IoU, pixel accuracy and mean accuracy are always reported; when
    ``eval_ap`` is set, part-based AP and PCP are computed as well.

    Args:
        all_results: two-element sequence passed (element 0, element 1) to
            ``eval_parsing_ap``; only read when ``eval_ap`` is set.
        eval_ap: whether to compute the AP/PCP metrics.
        score_thresh: forwarded to ``eval_parsing_ap``.
        num_parsing: forwarded to the metric helpers.
        im_dir: image directory forwarded to the metric helpers.
        ann_fn: annotation file used for category names and AP evaluation.
        output_folder: folder containing the 'parsing_predict' directory.

    Returns:
        Dict with 'mIoU', 'pixel_acc' and 'mean_acc', plus 'APp50', 'APpvol'
        and 'PCP' when ``eval_ap`` is set.
    """
    logging_rank('Evaluating parsing')
    predict_dir = os.path.join(output_folder, 'parsing_predict')
    assert os.path.exists(predict_dir), 'predict dir \'{}\' not found'.format(predict_dir)

    _iou, _miou, mean_acc, pixel_acc = parsing_iou(predict_dir, im_dir, num_parsing)
    parsing_result = {'mIoU': _miou, 'pixel_acc': pixel_acc, 'mean_acc': mean_acc}

    parsing_name = get_parsing(ann_fn)
    logging_rank('IoU for each category:')
    assert len(parsing_name) == len(_iou), '{} VS {}'.format(str(len(parsing_name)), str(len(_iou)))

    for idx, class_iou in enumerate(_iou):
        print(' {:<30}: {:.2f}'.format(parsing_name[idx], 100 * class_iou))
    print('----------------------------------------')
    for label, value in (('mean IoU', _miou), ('pixel acc', pixel_acc), ('mean acc', mean_acc)):
        print(' {:<30}: {:.2f}'.format(label, 100 * value))

    if not eval_ap:
        return parsing_result

    all_ap_p, all_pcp = eval_parsing_ap(all_results[0], all_results[1], score_thresh,
                                        im_dir, ann_fn, num_parsing)
    ap_p_vol = np.mean(all_ap_p)

    print('~~~~ Summary metrics ~~~~')
    # Each summary line pairs its exact template with the value it reports.
    summary = (
        (' Average Precision based on part (APp) @[mIoU=0.10:0.90 ] = {:.3f}', ap_p_vol),
        (' Average Precision based on part (APp) @[mIoU=0.10 ] = {:.3f}', all_ap_p[0]),
        (' Average Precision based on part (APp) @[mIoU=0.30 ] = {:.3f}', all_ap_p[2]),
        (' Average Precision based on part (APp) @[mIoU=0.50 ] = {:.3f}', all_ap_p[4]),
        (' Average Precision based on part (APp) @[mIoU=0.70 ] = {:.3f}', all_ap_p[6]),
        (' Average Precision based on part (APp) @[mIoU=0.90 ] = {:.3f}', all_ap_p[8]),
        (' Percentage of Correctly parsed semantic Parts (PCP) @[mIoU=0.50 ] = {:.3f}', all_pcp[4]),
    )
    for template, value in summary:
        print(template.format(value))

    parsing_result['APp50'] = all_ap_p[4]
    parsing_result['APpvol'] = ap_p_vol
    parsing_result['PCP'] = all_pcp[4]
    return parsing_result
def __iter__(self):
    """Yield this replica's share of the epoch's sample indices.

    A generator seeded with ``self.epoch`` produces the epoch indices via
    ``self._get_epoch_indices``, so the result is deterministic per epoch.
    When ``self.shuffle`` is set the indices are additionally permuted with
    the same generator. ``self.total_size`` and ``self.num_samples`` are
    updated as a side effect; the slice belonging to ``self.rank`` is returned.
    """
    gen = torch.Generator()
    gen.manual_seed(self.epoch)
    indices = self._get_epoch_indices(gen)
    if self.shuffle:
        # deterministically shuffle based on epoch
        # assumes `indices` supports list indexing (e.g. a tensor) — TODO confirm
        order = torch.randperm(len(indices), generator=gen).tolist()
        indices = indices[order]

    # With balancing, len(indices) can differ from the dataset's image count.
    self.total_size = len(indices)
    logging_rank('balance sample total_size: {}'.format(self.total_size),
                 distributed=1, local_rank=self.rank)

    # subsample: contiguous, equally sized chunk per replica
    self.num_samples = int(len(indices) / self.num_replicas)
    begin = self.num_samples * self.rank
    indices = indices[begin: begin + self.num_samples]
    assert len(indices) == self.num_samples
    return iter(indices)
args.distributed = num_gpus > 1 if args.distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend="nccl", init_method="env://") args.world_size = torch.distributed.get_world_size() else: args.world_size = 1 args.local_rank = 0 cfg.NUM_GPUS = len(os.environ['CUDA_VISIBLE_DEVICES'].split( ',')) if cfg.DEVICE == 'cuda' else 1 cfg.TRAIN.LOADER_THREADS *= cfg.NUM_GPUS cfg.TEST.LOADER_THREADS *= cfg.NUM_GPUS cfg.TEST.IMS_PER_GPU *= cfg.NUM_GPUS logging_rank('Called with args: {}'.format(args), distributed=args.distributed, local_rank=args.local_rank) def train(model, loader, optimizer, scheduler, checkpointer, logger): # switch to train mode model.train() # main loop start_iter = scheduler.iteration for iteration, (images, targets, _) in enumerate(loader, start_iter): logger.iter_tic() logger.data_tic() scheduler.step() # adjust learning rate optimizer.zero_grad()