def after_step(self):
    next_iter = self.trainer.iter + 1
    is_final = next_iter == self.trainer.max_iter
    if is_final or (self._period > 0 and next_iter % self._period == 0):
        results = self._func()

        if results:
            assert isinstance(
                results, dict
            ), "Eval function must return a dict. Got {} instead.".format(results)

            flattened_results = flatten_results_dict(results)
            for k, v in flattened_results.items():
                try:
                    v = float(v)
                except Exception:
                    raise ValueError(
                        "[EvalHook] eval_function should return a nested dict of float. "
                        "Got '{}: {}' instead.".format(k, v)
                    )
            self.trainer.storage.put_scalars(**flattened_results, smoothing_hint=False)

        # Evaluation may take different time among workers.
        # A barrier makes them start the next iteration together.
        comm.synchronize()
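# --- Hedged usage sketch (not part of the hook above) ---
# A minimal illustration of what an eval function wired into EvalHook might
# look like, assuming the detectron2-style trainer/hook API. The task names
# and metric values below are placeholders.
def example_eval_func():
    # Typically this would call inference_on_dataset(...) and return its metrics;
    # the hook requires a (possibly nested) dict whose leaves are castable to float.
    return {"bbox": {"AP": 40.0, "AP50": 60.0}}

# Registration would look roughly like:
#   trainer.register_hooks([EvalHook(cfg.TEST.EVAL_PERIOD, example_eval_func)])
# after_step() then flattens {"bbox": {"AP": 40.0}} into {"bbox/AP": 40.0}
# before writing it to the trainer's EventStorage.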
def _distributed_worker(
    local_rank, main_func, world_size, num_gpus_per_machine, machine_rank, dist_url, args
):
    assert torch.cuda.is_available(), "cuda is not available. Please check your installation."
    global_rank = machine_rank * num_gpus_per_machine + local_rank
    try:
        dist.init_process_group(
            backend="NCCL", init_method=dist_url, world_size=world_size, rank=global_rank
        )
    except Exception as e:
        logger = logging.getLogger(__name__)
        logger.error("Process group URL: {}".format(dist_url))
        raise e

    # synchronize is needed here to prevent a possible timeout after calling init_process_group
    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
    comm.synchronize()

    assert num_gpus_per_machine <= torch.cuda.device_count()
    torch.cuda.set_device(local_rank)

    # Setup the local process group (which contains ranks within the same machine)
    assert comm._LOCAL_PROCESS_GROUP is None
    num_machines = world_size // num_gpus_per_machine
    for i in range(num_machines):
        ranks_on_i = list(range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine))
        pg = dist.new_group(ranks_on_i)
        if i == machine_rank:
            comm._LOCAL_PROCESS_GROUP = pg

    main_func(*args)
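# --- Hedged launch sketch ---
# An assumption of how the worker above is typically spawned: one process per
# local GPU via torch.multiprocessing.spawn, which supplies local_rank as the
# first argument (so it is omitted from `args` below). `dist_url` is a placeholder.
import torch.multiprocessing as mp

def launch_example(main_func, num_gpus_per_machine, num_machines=1, machine_rank=0,
                   dist_url="tcp://127.0.0.1:29500", args=()):
    world_size = num_machines * num_gpus_per_machine
    if world_size > 1:
        mp.spawn(
            _distributed_worker,
            nprocs=num_gpus_per_machine,
            args=(main_func, world_size, num_gpus_per_machine, machine_rank, dist_url, args),
            daemon=False,
        )
    else:
        main_func(*args)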
def evaluate(self):
    if self._distributed:
        comm.synchronize()
        self._predictions = comm.gather(self._predictions, dst=0)
        self._predictions = list(itertools.chain(*self._predictions))

        if not comm.is_main_process():
            return {}

    if len(self._predictions) == 0:
        self._logger.warning("[COCOEvaluator] Did not receive valid predictions.")
        return {}

    if self._output_dir:
        PathManager.mkdirs(self._output_dir)
        file_path = os.path.join(self._output_dir, "instances_predictions.pth")
        with PathManager.open(file_path, "wb") as f:
            torch.save(self._predictions, f)

    self._results = OrderedDict()
    if "proposals" in self._predictions[0]:
        self._eval_box_proposals()
    if "instances" in self._predictions[0]:
        self._eval_predictions(set(self._tasks))
    # Copy so the caller can do whatever with results
    return copy.deepcopy(self._results)
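# --- Hedged usage sketch for the evaluator above ---
# Assumes the detectron2 data/evaluation APIs and the newer COCOEvaluator
# signature that takes no cfg argument; the dataset name and output directory
# are placeholders, and `cfg`/`model` come from the caller.
from detectron2.data import build_detection_test_loader
from detectron2.evaluation import inference_on_dataset

def run_coco_eval_example(cfg, model, dataset_name="coco_2017_val", output_dir="./output"):
    evaluator = COCOEvaluator(dataset_name, distributed=True, output_dir=output_dir)
    loader = build_detection_test_loader(cfg, dataset_name)
    # inference_on_dataset() feeds every batch to the evaluator and finally calls
    # evaluate(), which gathers predictions from all workers onto rank 0.
    return inference_on_dataset(model, loader, evaluator)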
def evaluate(self):
    """
    Returns:
        dict: has a key "segm", whose value is a dict of "AP" and "AP50".
    """
    comm.synchronize()
    if comm.get_rank() > 0:
        return

    os.environ["CITYSCAPES_DATASET"] = os.path.abspath(
        os.path.join(self._metadata.gt_dir, "..", "..")
    )
    # Load the Cityscapes eval script *after* setting the required env var,
    # since the script reads CITYSCAPES_DATASET into global variables at load time.
    import cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling as cityscapes_eval

    self._logger.info("Evaluating results under {} ...".format(self._temp_dir))

    # set some global states in cityscapes evaluation API, before evaluating
    cityscapes_eval.args.predictionPath = os.path.abspath(self._temp_dir)
    cityscapes_eval.args.predictionWalk = None
    cityscapes_eval.args.JSONOutput = False
    cityscapes_eval.args.colorized = False
    cityscapes_eval.args.gtInstancesFile = os.path.join(self._temp_dir, "gtInstances.json")

    # These lines are adopted from
    # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalInstanceLevelSemanticLabeling.py # noqa
    groundTruthImgList = glob.glob(cityscapes_eval.args.groundTruthSearch)
    assert len(
        groundTruthImgList
    ), "Cannot find any ground truth images to use for evaluation. Searched for: {}".format(
        cityscapes_eval.args.groundTruthSearch
    )
    predictionImgList = []
    for gt in groundTruthImgList:
        predictionImgList.append(cityscapes_eval.getPrediction(gt, cityscapes_eval.args))
    results = cityscapes_eval.evaluateImgLists(
        predictionImgList, groundTruthImgList, cityscapes_eval.args
    )["averages"]

    ret = OrderedDict()
    ret["segm"] = {"AP": results["allAp"] * 100, "AP50": results["allAp50%"] * 100}
    self._working_dir.cleanup()
    return ret
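# --- Hedged illustration of the CITYSCAPES_DATASET path arithmetic above ---
# If the metadata's gt_dir were, say, "datasets/cityscapes/gtFine/val" (a
# placeholder path), going up two levels yields "datasets/cityscapes", which is
# where cityscapesscripts expects to find the gtFine/ folder.
import os

example_gt_dir = "datasets/cityscapes/gtFine/val"
example_root = os.path.abspath(os.path.join(example_gt_dir, "..", ".."))
# example_root now ends with ".../datasets/cityscapes"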
def evaluate(self):
    """
    Evaluates standard semantic segmentation metrics (http://cocodataset.org/#stuff-eval):

    * Mean intersection-over-union averaged across classes (mIoU)
    * Frequency Weighted IoU (fwIoU)
    * Mean pixel accuracy averaged across classes (mACC)
    * Pixel Accuracy (pACC)
    """
    if self._distributed:
        synchronize()
        conf_matrix_list = all_gather(self._conf_matrix)
        self._predictions = all_gather(self._predictions)
        self._predictions = list(itertools.chain(*self._predictions))
        self._real_flops = all_gather(self._real_flops)
        self._real_flops = list(itertools.chain(*self._real_flops))
        self._expt_flops = all_gather(self._expt_flops)
        self._expt_flops = list(itertools.chain(*self._expt_flops))
        if not is_main_process():
            return

        self._conf_matrix = np.zeros_like(self._conf_matrix)
        for conf_matrix in conf_matrix_list:
            self._conf_matrix += conf_matrix

    if self._output_dir:
        PathManager.mkdirs(self._output_dir)
        file_path = os.path.join(self._output_dir, "sem_seg_predictions.json")
        with PathManager.open(file_path, "w") as f:
            f.write(json.dumps(self._predictions))

    acc = np.zeros(self._num_classes, dtype=np.float)
    iou = np.zeros(self._num_classes, dtype=np.float)
    tp = self._conf_matrix.diagonal()[:-1].astype(np.float)
    pos_gt = np.sum(self._conf_matrix[:-1, :-1], axis=0).astype(np.float)
    class_weights = pos_gt / np.sum(pos_gt)
    pos_pred = np.sum(self._conf_matrix[:-1, :-1], axis=1).astype(np.float)
    acc_valid = pos_gt > 0
    acc[acc_valid] = tp[acc_valid] / pos_gt[acc_valid]
    iou_valid = (pos_gt + pos_pred) > 0
    union = pos_gt + pos_pred - tp
    iou[acc_valid] = tp[acc_valid] / union[acc_valid]
    macc = np.sum(acc) / np.sum(acc_valid)
    miou = np.sum(iou) / np.sum(iou_valid)
    fiou = np.sum(iou * class_weights)
    pacc = np.sum(tp) / np.sum(pos_gt)

    res = {}
    res["mIoU"] = 100 * miou
    res["fwIoU"] = 100 * fiou
    res["mACC"] = 100 * macc
    res["pACC"] = 100 * pacc

    # add flops calculation
    if len(self._real_flops) > 0 and len(self._expt_flops) > 0:
        res["mean_real_flops"] = (sum(self._real_flops) / len(self._real_flops)) / 1e3
        res["max_real_flops"] = max(self._real_flops) / 1e3
        res["min_real_flops"] = min(self._real_flops) / 1e3
        res["mean_expt_flops"] = (sum(self._expt_flops) / len(self._expt_flops)) / 1e3
        res["max_expt_flops"] = max(self._expt_flops) / 1e3
        res["min_expt_flops"] = min(self._expt_flops) / 1e3

    if self._output_dir:
        file_path = os.path.join(self._output_dir, "sem_seg_evaluation.pth")
        with PathManager.open(file_path, "wb") as f:
            torch.save(res, f)
    results = OrderedDict({"sem_seg": res})
    self._logger.info(results)
    return results
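# --- Hedged worked example of the IoU bookkeeping above ---
# A tiny 3-class confusion matrix (last row/column reserved for the ignore
# label), with rows indexed by prediction and columns by ground truth, matching
# the axis conventions used in evaluate(). The values are made up for illustration.
import numpy as np

example_conf = np.array(
    [
        [5, 1, 0, 0],
        [0, 4, 1, 0],
        [1, 0, 3, 0],
        [0, 0, 0, 0],
    ],
    dtype=np.float64,
)
example_tp = example_conf.diagonal()[:-1]              # [5, 4, 3]
example_pos_gt = example_conf[:-1, :-1].sum(axis=0)    # [6, 5, 4]
example_pos_pred = example_conf[:-1, :-1].sum(axis=1)  # [6, 5, 4]
example_iou = example_tp / (example_pos_gt + example_pos_pred - example_tp)
# example_iou ≈ [0.714, 0.667, 0.600]; mIoU is their mean over valid classes.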
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DetectionCheckpointer(
        model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler
    )
    start_iter = (
        checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
    )
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(
        checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
    )

    writers = (
        [
            CommonMetricPrinter(max_iter),
            JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
            TensorboardXWriter(cfg.OUTPUT_DIR),
        ]
        if comm.is_main_process()
        else []
    )

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement
    data_loader = build_detection_train_loader(cfg)
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()

            loss_dict = model(data)
            losses = sum(loss for loss in loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
            scheduler.step()

            if (
                cfg.TEST.EVAL_PERIOD > 0
                and iteration % cfg.TEST.EVAL_PERIOD == 0
                and iteration != max_iter
            ):
                do_test(cfg, model)
                # Compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()

            if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
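# --- Hedged sketch of an entry point around do_train/do_test ---
# Follows the plain_train_net.py layout as an assumption: setup() and do_test()
# are expected to be defined elsewhere in this file, and `args` to come from
# detectron2's default_argument_parser().
from detectron2.modeling import build_model
from torch.nn.parallel import DistributedDataParallel

def main_example(args):
    cfg = setup(args)
    model = build_model(cfg)
    if args.eval_only:
        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume
        )
        return do_test(cfg, model)

    # Wrap the model for multi-GPU training before entering the loop above.
    if comm.get_world_size() > 1:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
    do_train(cfg, model, resume=args.resume)
    return do_test(cfg, model)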