def evaluate(self):
    if self._distributed:
        comm.synchronize()
        self._predictions = comm.gather(self._predictions, dst=0)
        self._predictions = list(itertools.chain(*self._predictions))

        if not comm.is_main_process():
            return {}

    if len(self._predictions) == 0:
        self._logger.warning("[COCOEvaluator] Did not receive valid predictions.")
        return {}

    if self._output_dir:
        PathManager.mkdirs(self._output_dir)
        file_path = os.path.join(self._output_dir, "instances_predictions.pth")
        with PathManager.open(file_path, "wb") as f:
            torch.save(self._predictions, f)

    self._results = OrderedDict()
    if "instances" in self._predictions[0]:
        self._eval_predictions()
    # Copy so the caller can do whatever with results
    return copy.deepcopy(self._results)
def evaluate(self):
    if self._distributed:
        comm.synchronize()
        predictions = comm.gather(self._predictions, dst=0)
        predictions = list(itertools.chain(*predictions))

        if not comm.is_main_process():
            return {}
    else:
        predictions = self._predictions
    # predictions: list of dicts, each of the form
    # {'image_id', 'instances': [{'image_id', 'category_id', 'bbox', 'score'}, ...]}

    if len(predictions) == 0:
        self._logger.warning("[SMDEvaluator] Did not receive valid predictions.")
        return {}

    if self._output_dir:
        PathManager.mkdirs(self._output_dir)
        file_path = os.path.join(self._output_dir, "instances_predictions.pth")
        with PathManager.open(file_path, "wb") as f:
            torch.save(predictions, f)

    self._results = OrderedDict()
    if "proposals" in predictions[0]:
        self._eval_box_proposals(predictions)
    if "instances" in predictions[0]:
        self._eval_predictions(set(self._tasks), predictions)
        self._eval_predictions_others(self._coco_api, predictions)
    # Copy so the caller can do whatever with results
    return copy.deepcopy(self._results)
def after_step(self):
    """Run after every iteration, see parent for details."""
    self.num_steps += 1
    if self.num_steps % self._period == 0:
        data = next(self._loader)
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        with torch.no_grad():
            loss_dict = self.trainer.model(data)

            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                "val_" + k: v.item()
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                self.trainer.storage.put_scalars(
                    total_val_loss=losses_reduced, **loss_dict_reduced)
        comm.synchronize()
def evaluate(self):
    if self._distributed:
        comm.synchronize()
        self._predictions = comm.gather(self._predictions, dst=0)
        self._predictions = list(itertools.chain(*self._predictions))

        if not comm.is_main_process():
            return {}

    if len(self._predictions) == 0:
        self._logger.warning("[COCOEvaluator] Did not receive valid predictions.")
        return {}

    if self._output_dir:
        PathManager.mkdirs(self._output_dir)
        file_path = os.path.join(self._output_dir, "instances_predictions.pth")
        with PathManager.open(file_path, "wb") as f:
            torch.save(self._predictions, f)

    self._results = OrderedDict()
    if "proposals" in self._predictions[0]:
        self._eval_box_proposals()
    # Initialize so an empty dict is returned when there are no "instances" predictions.
    miou = {}
    if "instances" in self._predictions[0]:
        miou = {'miou': self._eval_predictions(set(self._tasks))}
    return miou
def run(args):
    from template_lib.d2.utils import set_ddp_seed
    set_ddp_seed(outdir=f"{global_cfg.tl_outdir}/d2")

    total_batch_size = global_cfg.build_dataloader.batch_size
    num_workers = comm.get_world_size()
    batch_size = total_batch_size // num_workers

    data_loader = build_dataloader(global_cfg.build_dataloader,
                                   kwargs_priority=True,
                                   batch_size=batch_size,
                                   distributed=args.distributed)

    FID_IS_torch = build_GAN_metric(global_cfg.GAN_metric)
    if global_cfg.tl_debug:
        num_images = 50
    else:
        num_images = float('inf')
    FID_IS_torch.calculate_fid_stat_of_dataloader(
        data_loader=data_loader,
        num_images=num_images,
        save_fid_stat=global_cfg.save_fid_stat)

    comm.synchronize()
def setup_after_launch(cfg, output_dir, runner):
    """
    Set things up after entering DDP, including
        - creating working directory
        - setting up logger
        - logging environment
        - initializing runner
    """
    create_dir_on_global_main_process(output_dir)
    comm.synchronize()
    setup_loggers(output_dir)
    cfg.freeze()
    if cfg.OUTPUT_DIR != output_dir:
        with temp_defrost(cfg):
            logger.warning(
                "Override cfg.OUTPUT_DIR ({}) to be the same as output_dir {}".format(
                    cfg.OUTPUT_DIR, output_dir))
            cfg.OUTPUT_DIR = output_dir
    logger.info("Initializing runner ...")
    runner = initialize_runner(runner, cfg)
    log_info(cfg, runner)
    dump_cfg(cfg, os.path.join(output_dir, "config.yaml"))

    auto_scale_world_size(cfg, new_world_size=comm.get_world_size())
def _do_eval(self):
    results = self._func()

    if results:
        assert isinstance(results, dict), \
            "Eval function must return a dict. Got {} instead.".format(results)

        print('Before flatten: ', results)
        flattened_results = flatten_results_dict(results)
        print('After flatten: ', flattened_results)
        for k, v in flattened_results.items():
            try:
                v = float(v)
            except Exception:
                raise ValueError(
                    "[EvalHook] eval_function should return a nested dict of float. "
                    "Got '{}: {}' instead.".format(k, v))
        self.trainer.storage.put_scalars(**flattened_results, smoothing_hint=False)

    # Evaluation may take different time among workers.
    # A barrier makes them start the next iteration together.
    comm.synchronize()
def evaluate(self):
    """
    Evaluates standard semantic segmentation metrics (http://cocodataset.org/#stuff-eval):

    * Mean intersection-over-union averaged across classes (mIoU)
    * Frequency Weighted IoU (fwIoU)
    * Mean pixel accuracy averaged across classes (mACC)
    * Pixel Accuracy (pACC)
    """
    if self._distributed:
        synchronize()
        conf_matrix_list = all_gather(self._conf_matrix)
        self._predictions = all_gather(self._predictions)
        self._predictions = list(itertools.chain(*self._predictions))
        if not is_main_process():
            return

        self._conf_matrix = np.zeros_like(self._conf_matrix)
        for conf_matrix in conf_matrix_list:
            self._conf_matrix += conf_matrix

    if self._output_dir:
        PathManager.mkdirs(self._output_dir)
        file_path = os.path.join(self._output_dir, "sem_seg_predictions.json")
        with PathManager.open(file_path, "w") as f:
            f.write(json.dumps(self._predictions))

    # Note: the deprecated np.float alias is replaced by the builtin float,
    # which is what it resolved to; recent NumPy versions removed np.float.
    acc = np.full(self._num_classes, np.nan, dtype=float)
    iou = np.full(self._num_classes, np.nan, dtype=float)
    tp = self._conf_matrix.diagonal()[:-1].astype(float)
    pos_gt = np.sum(self._conf_matrix[:-1, :-1], axis=0).astype(float)
    class_weights = pos_gt / np.sum(pos_gt)
    pos_pred = np.sum(self._conf_matrix[:-1, :-1], axis=1).astype(float)
    acc_valid = pos_gt > 0
    acc[acc_valid] = tp[acc_valid] / pos_gt[acc_valid]
    iou_valid = (pos_gt + pos_pred) > 0
    union = pos_gt + pos_pred - tp
    iou[acc_valid] = tp[acc_valid] / union[acc_valid]
    macc = np.sum(acc[acc_valid]) / np.sum(acc_valid)
    miou = np.sum(iou[acc_valid]) / np.sum(iou_valid)
    fiou = np.sum(iou[acc_valid] * class_weights[acc_valid])
    pacc = np.sum(tp) / np.sum(pos_gt)

    res = {}
    res["mIoU"] = 100 * miou
    res["fwIoU"] = 100 * fiou
    for i, name in enumerate(self._class_names):
        res["IoU-{}".format(name)] = 100 * iou[i]
    res["mACC"] = 100 * macc
    res["pACC"] = 100 * pacc
    for i, name in enumerate(self._class_names):
        res["ACC-{}".format(name)] = 100 * acc[i]

    if self._output_dir:
        file_path = os.path.join(self._output_dir, "sem_seg_evaluation.pth")
        with PathManager.open(file_path, "wb") as f:
            torch.save(res, f)
    results = OrderedDict({"sem_seg": res})
    self._logger.info(results)
    return results
def evaluate(self, img_ids=None):
    """
    Args:
        img_ids: a list of image IDs to evaluate on. Default to None for the whole dataset
    """
    if self._distributed:
        comm.synchronize()
        predictions = comm.gather(self._predictions, dst=0)
        predictions = list(itertools.chain(*predictions))

        if not comm.is_main_process():
            return {}
    else:
        predictions = self._predictions

    if len(predictions) == 0:
        self._logger.warning("[COCOEvaluator] Did not receive valid predictions.")
        return {}

    if self._output_dir:
        PathManager.mkdirs(self._output_dir)
        file_path = os.path.join(self._output_dir, "instances_predictions.pth")
        with PathManager.open(file_path, "wb") as f:
            torch.save(predictions, f)

    self._results = OrderedDict()
    if "proposals" in predictions[0]:
        self._eval_box_proposals(predictions)
    if "instances" in predictions[0]:
        self._eval_predictions(predictions, img_ids=img_ids)
    # Copy so the caller can do whatever with results
    return copy.deepcopy(self._results)
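An evaluate() like the one above is normally reached through detectron2's inference_on_dataset() loop, which calls reset(), process() per batch, and finally evaluate() on every worker; the gather/synchronize logic above is what collapses the distributed case onto the main process. The following is a minimal usage sketch, not part of the original snippet; the dataset name "my_val_set" and the cfg object are illustrative assumptions.

# Usage sketch (assumed names: "my_val_set", cfg); not part of the snippet above.
from detectron2.data import build_detection_test_loader
from detectron2.evaluation import COCOEvaluator, inference_on_dataset

def run_coco_eval(cfg, model):
    evaluator = COCOEvaluator("my_val_set", output_dir=cfg.OUTPUT_DIR)
    val_loader = build_detection_test_loader(cfg, "my_val_set")
    # Returns the OrderedDict produced by evaluate() on the main process,
    # and an empty dict on the other workers.
    return inference_on_dataset(model, val_loader, evaluator)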
def _do_eval(self):
    results = self._func()
    logger = logging.getLogger(__name__)

    if results:
        assert isinstance(results, dict), \
            "Eval function must return a dict. Got {} instead.".format(results)

        flattened_results = flatten_results_dict(results)
        valid = dict()
        for k, v in flattened_results.items():
            try:
                valid[k] = float(v)
            # currently only supports skipping (list, Tensor, numpy.ndarray)
            # TODO: maybe other types of exceptions need to be taken into consideration
            except (ValueError, TypeError):
                logger.info("Skip put {}: {} to tensorboard".format(k, type(v)))
        self.trainer.storage.put_scalars(**valid, smoothing_hint=False)

    # Evaluation may take different time among workers.
    # A barrier makes them start the next iteration together.
    comm.synchronize()
def _do_eval(self):
    results = self._func()

    if results:
        assert isinstance(results, dict), \
            "Eval function must return a dict. Got {} instead.".format(results)

        flattened_results = flatten_results_dict(results)
        for k, v in flattened_results.items():
            try:
                v = float(v)
            except Exception as e:
                raise ValueError(
                    "[EvalHook] eval_function should return a nested dict of float. "
                    "Got '{}: {}' instead.".format(k, v)) from e
        self.trainer.storage.put_scalars(**flattened_results, smoothing_hint=False)

    if comm.is_main_process() and results:
        # save evaluation results in json
        is_final = self.trainer.iter + 1 >= self.trainer.max_iter
        os.makedirs(os.path.join(self.cfg.OUTPUT_DIR, 'inference'), exist_ok=True)
        output_file = 'res_final.json' if is_final else \
            'iter_{:07d}.json'.format(self.trainer.iter)
        with PathManager.open(
                os.path.join(self.cfg.OUTPUT_DIR, 'inference', output_file),
                'w') as fp:
            json.dump(results, fp)

    # Evaluation may take different time among workers.
    # A barrier makes them start the next iteration together.
    comm.synchronize()
def run_step(self):
    """
    Implement the standard training logic described above.
    """
    assert self.model.training, "[SimpleTrainer] model was changed to eval mode!"
    start = time.perf_counter()
    """
    If you want to do something with the data, you can wrap the dataloader.
    """
    classifier_data = next(self._data_loader_iter)
    base_data = None
    meta_data = None
    data_time = time.perf_counter() - start

    loss_dict = self.model(base_data,
                           weak_batched_inputs=classifier_data,
                           meta_data=meta_data)
    losses = sum(loss_dict.values())
    self._detect_anomaly(losses, loss_dict)

    self.optimizer.zero_grad()
    losses.backward()
    self.optimizer.step()
    comm.synchronize()

    metrics_dict = loss_dict
    metrics_dict["data_time"] = data_time
    self._write_metrics(metrics_dict)
def evaluate(self):
    # Runs once all predictions have been collected.
    if self._distributed:
        comm.synchronize()
        predictions = comm.gather(self._predictions, dst=0)
        predictions = list(itertools.chain(*predictions))

        if not comm.is_main_process():
            return {}
    else:
        predictions = self._predictions

    if len(predictions) == 0:
        self._logger.warning("[COCOEvaluator] Did not receive valid predictions.")
        return {}

    if self._output_dir:
        PathManager.mkdirs(self._output_dir)
        file_path = os.path.join(self._output_dir, "instances_predictions.pth")
        with PathManager.open(file_path, "wb") as f:
            torch.save(predictions, f)

    self._results = OrderedDict()
    if "proposals" in predictions[0]:
        self._eval_box_proposals(predictions)
    if "instances" in predictions[0]:
        self._eval_predictions(set(self._tasks), predictions)
    # Copy so the caller can do whatever with results
    return copy.deepcopy(self._results)
def raw_to_detectron(data_path: Path, remove_cache: bool, cfg: CfgNode):
    data_splits = ['val']
    data_splits += ['train'] if not cfg.DEBUG else []
    for name in data_splits:
        coco_path = Path('.') / 'tmp' / ('coco_' + name + '.json')
        if (remove_cache or not coco_path.exists()) and comm.is_main_process():
            input_files = [a for a in (data_path / name / 'inputs').iterdir()]
            mask_ext = next((data_path / name / 'masks').iterdir()).name.split('.')[1]
            mask_files = [
                a.parent.parent / 'masks' / (a.name.split('.')[0] + '.' + mask_ext)
                for a in input_files
            ]
            shutil.rmtree(coco_path, ignore_errors=True)
            coco_path.parent.mkdir(parents=True, exist_ok=True)
            frame_objects = array_apply(process_frame,
                                        zip(input_files, mask_files, repeat(cfg.MIN_AREA)),
                                        parallel=not DEBUG,
                                        total=len(input_files),
                                        chunksize=1000)
            write_serialized(frame_objects, coco_path)
        # Register on every process so the dataset is visible outside the main rank.
        DatasetCatalog.register(name, lambda d=coco_path: get_data_dicts(d))
        MetadataCatalog.get(name).set(thing_classes=['object'])
    comm.synchronize()
def evaluate(self):
    if self._distributed:
        comm.synchronize()
        predictions = comm.gather(self._predictions, dst=0)
        predictions = list(itertools.chain(*predictions))

        if not comm.is_main_process():
            return {}
    else:
        predictions = self._predictions

    image_contains_mixed_unknowns = [
        prediction['image_contains_mixed_unknowns'] for prediction in predictions
    ]
    scores = [prediction['scores'] for prediction in predictions]
    correct = [prediction['correct'] for prediction in predictions]
    pred_classes = [prediction['pred_classes'] for prediction in predictions]

    category_counts = {}
    for category in self._coco_api.cats:
        if category not in category_counts:
            category_counts[self.internal_dataset_mapping[category]] = 0
        category_counts[self.internal_dataset_mapping[category]] += len(
            self._coco_api.getAnnIds(catIds=[category]))

    return dict(
        predictions=dict(
            image_contains_mixed_unknowns=image_contains_mixed_unknowns,
            scores=scores,
            correct=correct,
            pred_classes=pred_classes),
        category_counts=category_counts)
def evaluate(self):
    """
    Returns:
        In detectron2.tools.train_net.py, the following format is expected:
        dict:
            * key: the name of the task (e.g., bbox)
            * value: a dict of {metric name: score}, e.g.: {"AP50": 80}
    """
    if self._distributed:
        comm.synchronize()
        prediction_counts = comm.gather(self.prediction_counts, dst=0)
        prediction_counts = list(itertools.chain(*prediction_counts))
        confidence_scores = comm.gather(self.confidence_scores, dst=0)
        confidence_scores = list(itertools.chain(*confidence_scores))

        if not comm.is_main_process():
            return {}
    else:
        prediction_counts = self.prediction_counts
        confidence_scores = self.confidence_scores

    mpi = np.mean(prediction_counts)
    mcp = np.mean(confidence_scores)
    output_metrics = OrderedDict({
        "false_positives": {
            "predictions_per_image": mpi,
            "confidence_per_prediction": mcp,
        }
    })
    logger.info(f"mean predictions per image: {mpi}")
    logger.info(f"mean confidence per prediction: {mcp}")
    return output_metrics
def evaluate(self):
    if self._distributed:
        synchronize()
        endpoint_errors = all_gather(self._endpoint_errors)
        endpoint_errors = [per_image for per_gpu in endpoint_errors for per_image in per_gpu]
        self._predictions = all_gather(self._predictions)
        if not is_main_process():
            return

    if self._output_dir:
        PathManager.mkdirs(self._output_dir)
        file_path = os.path.join(self._output_dir, "flow_predictions.json")
        with PathManager.open(file_path, "w") as f:
            f.write(json.dumps(self._predictions))

    ave_epe = sum(endpoint_errors) / len(endpoint_errors)
    res = {"ave_epe": ave_epe}

    if self._output_dir:
        file_path = os.path.join(self._output_dir, "flow_evaluation.pth")
        with PathManager.open(file_path, "wb") as f:
            torch.save(res, f)

    results = OrderedDict({"flow": res})
    small_table = create_small_table(res)
    self._logger.info("Evaluation results for flow: \n" + small_table)
    dump_info_one_task = {
        "task": "flow",
        "tables": [small_table],
    }
    _dump_to_markdown([dump_info_one_task])
    return results
def get_avg_losses(self):
    if self._distributed:
        synchronize()
        self._losses = all_gather(self._losses)
        if not is_main_process():
            return

        all_losses = {}
        for p in self._losses:
            all_losses.update(p)
    else:
        all_losses = self._losses

    image_unique_ids = list(all_losses.keys())
    loss_keys = list(all_losses[image_unique_ids[0]].keys())

    losses_global_avg = {}
    for key in loss_keys:
        losses_global_avg[key] = []
    for img_spec_id in image_unique_ids:
        loss_sig = all_losses[img_spec_id]
        for key in loss_keys:
            losses_global_avg[key].append(loss_sig[key])
    for key in loss_keys:
        losses_global_avg[key] = np.array(losses_global_avg[key]).mean()

    global_loss = OrderedDict(losses_global_avg)
    return global_loss
def after_step(self):
    next_iter = self.trainer.iter + 1
    is_final = next_iter == self.trainer.max_iter
    if is_final or (self._period > 0 and next_iter % self._period == 0):
        results = self._func()

        if results:
            assert isinstance(results, dict), \
                "Eval function must return a dict. Got {} instead.".format(results)

            flattened_results = flatten_results_dict(results)
            for k, v in flattened_results.items():
                try:
                    v = float(v)
                except Exception:
                    raise ValueError(
                        "[EvalHook] eval_function should return a nested dict of float. "
                        "Got '{}: {}' instead.".format(k, v))
            self.trainer.storage.put_scalars(**flattened_results, smoothing_hint=False)

        # Evaluation may take different time among workers.
        # A barrier makes them start the next iteration together.
        comm.synchronize()
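Hooks with an after_step() like the one above are attached to a trainer through register_hooks(). The sketch below shows one plausible way to wire this up with detectron2's built-in EvalHook; eval_period, do_eval, and build_trainer_with_eval are illustrative assumptions, not names from the original snippets.

# Registration sketch (assumed names: do_eval, eval_period); not part of the snippet above.
from detectron2.engine import DefaultTrainer, hooks

def build_trainer_with_eval(cfg, do_eval, eval_period=5000):
    trainer = DefaultTrainer(cfg)
    trainer.resume_or_load(resume=False)
    # EvalHook calls do_eval() every eval_period iterations and at the final
    # iteration, then synchronizes the workers, as in the after_step() above.
    trainer.register_hooks([hooks.EvalHook(eval_period, do_eval)])
    return trainer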
def train_func(self, data, iteration, pbar):
    images, labels = self.preprocess_image(data)
    images = images.tensor

    bs = len(images)
    batched_arcs = get_ddp_attr(self.controller, 'get_sampled_arc')(bs=bs)

    self.gan_model(images=images,
                   labels=labels,
                   z=self.z_train,
                   iteration=iteration,
                   batched_arcs=batched_arcs)

    if iteration % self.train_controller_every_iter == 0:
        get_ddp_attr(self.controller, 'train_controller')(
            G=self.G, z=self.z_train, y=self.y_train,
            controller=self.controller,
            controller_optim=self.controller_optim,
            iteration=iteration, pbar=pbar)

    # Just for monitoring the training process
    sampled_arc = get_ddp_attr(self.controller, 'get_sampled_arc')()
    sampled_arc = self.get_tensor_of_main_processing(sampled_arc)

    classes_arcs = sampled_arc[[0, ], ].repeat(self.n_classes, 1)
    self.evaluate_model(classes_arcs=classes_arcs, iteration=iteration)
    comm.synchronize()
def benchmark_distributed(self, num_iter, warmup=10):
    """
    Benchmark the dataloader in each distributed worker, and log results of
    all workers. This helps understand the final performance as well as the
    variances among workers.

    It also prints startup time (first iter) of the dataloader.
    """
    gpu = comm.get_world_size()
    dataset = MapDataset(self.dataset, self.mapper)
    n = self.num_workers
    loader = build_batch_data_loader(dataset,
                                     self.sampler,
                                     self.total_batch_size,
                                     num_workers=n)

    timer = Timer()
    loader = iter(loader)
    next(loader)
    startup_time = timer.seconds()
    logger.info("Dataloader startup time: {:.2f} seconds".format(startup_time))

    comm.synchronize()

    avg, all_times = self._benchmark(loader, num_iter * max(n, 1), warmup * max(n, 1))
    del loader
    self._log_time(
        f"DataLoader ({gpu} GPUs x {n} workers, total bs={self.total_batch_size})",
        avg,
        all_times,
        True,
    )
def train_func(self, data, iteration, pbar):
    classes_arcs = self.arcs.repeat(self.n_classes, 1)
    self.evaluate_model(classes_arcs=classes_arcs, iteration=iteration)
    comm.synchronize()
    exit(-1)
def _distributed_worker(local_rank, main_func, world_size, num_gpus_per_machine,
                        machine_rank, dist_url, args):
    assert torch.cuda.is_available(), "cuda is not available. Please check your installation."
    global_rank = machine_rank * num_gpus_per_machine + local_rank
    try:
        dist.init_process_group(backend="NCCL",
                                init_method=dist_url,
                                world_size=world_size,
                                rank=global_rank)
    except Exception as e:
        logger = logging.getLogger(__name__)
        logger.error("Process group URL: {}".format(dist_url))
        raise e
    # synchronize is needed here to prevent a possible timeout after calling init_process_group
    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
    comm.synchronize()

    assert num_gpus_per_machine <= torch.cuda.device_count()
    torch.cuda.set_device(local_rank)

    # Setup the local process group (which contains ranks within the same machine)
    assert comm._LOCAL_PROCESS_GROUP is None
    num_machines = world_size // num_gpus_per_machine
    for i in range(num_machines):
        ranks_on_i = list(range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine))
        pg = dist.new_group(ranks_on_i)
        if i == machine_rank:
            comm._LOCAL_PROCESS_GROUP = pg

    main_func(*args)
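A worker like _distributed_worker above is typically started once per GPU with torch.multiprocessing.spawn, mirroring detectron2's launch() pattern. The sketch below is an assumption-laden illustration, not the original launcher; the port in dist_url and the single-machine defaults are placeholders.

# Launch sketch (assumed defaults: one machine, port 29500); not part of the snippet above.
import torch.multiprocessing as mp

def launch(main_func, num_gpus_per_machine, num_machines=1, machine_rank=0,
           dist_url="tcp://127.0.0.1:29500", args=()):
    world_size = num_machines * num_gpus_per_machine
    if world_size > 1:
        # spawn() supplies local_rank as the first positional argument of the worker.
        mp.spawn(
            _distributed_worker,
            nprocs=num_gpus_per_machine,
            args=(main_func, world_size, num_gpus_per_machine,
                  machine_rank, dist_url, args),
            daemon=False,
        )
    else:
        main_func(*args)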
def _do_loss_eval(self):
    # Copying inference_on_dataset from evaluator.py
    total = len(self._data_loader)
    num_warmup = min(5, total - 1)

    start_time = time.perf_counter()
    total_compute_time = 0
    losses = []
    for idx, inputs in enumerate(self._data_loader):
        if idx == num_warmup:
            start_time = time.perf_counter()
            total_compute_time = 0
        start_compute_time = time.perf_counter()
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        total_compute_time += time.perf_counter() - start_compute_time
        iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
        seconds_per_img = total_compute_time / iters_after_start
        if idx >= num_warmup * 2 or seconds_per_img > 5:
            total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start
            eta = datetime.timedelta(seconds=int(total_seconds_per_img * (total - idx - 1)))
            log_every_n_seconds(
                logging.INFO,
                "Loss on Validation done {}/{}. {:.4f} s / img. ETA={}".format(
                    idx + 1, total, seconds_per_img, str(eta)),
                n=5,
            )
        loss_batch = self._get_loss(inputs)
        losses.append(loss_batch)
    mean_loss = np.mean(losses)
    self.trainer.storage.put_scalar('validation_loss', mean_loss)
    comm.synchronize()

    return losses
def evaluate(self):
    if self._distributed:
        comm.synchronize()
        self._predictions = comm.gather(self._predictions, dst=0)
        self._predictions = list(itertools.chain(*self._predictions))
        self.submit_results = comm.gather(self.submit_results, dst=0)
        self.submit_results = list(itertools.chain(*self.submit_results))

        if not comm.is_main_process():
            return {}

    if len(self._predictions) == 0:
        self._logger.warning("[COCOEvaluator] Did not receive valid predictions.")
        return {}

    self._logger.info("Preparing results for COCO format ...")
    self._coco_results = list(itertools.chain(*[x["instances"] for x in self._predictions]))

    if self._output_dir:
        res_file = os.path.join(self._output_dir, "crowdhuman_evaluate_results.json")
        self._logger.info("Saving results to {}".format(res_file))
        with PathManager.open(res_file, "w") as f:
            f.write(json.dumps(self._coco_results))
            f.flush()

        submit_file = os.path.join(self._output_dir, "submission.txt")
        self._logger.info("Saving results to {}".format(submit_file))
        with PathManager.open(submit_file, "w") as f:
            for result in self.submit_results:
                f.write(json.dumps(result))
                f.write("\n")
            f.flush()

    self._logger.info("Evaluating predictions ...")
    metrics = ["ALL"]
    results = {}
    ret_results = OrderedDict()
    for gt_json in [self._metadata.gt_file]:
        name = gt_json.split("/")[-1].split(".")[0]
        for id_setup in range(len(metrics)):
            cocoGt = COCO(gt_json)
            cocoDt = cocoGt.loadRes(res_file)
            imgIds = sorted(cocoGt.getImgIds())
            cocoEval = CrowdHumanEval(cocoGt, cocoDt, "bbox")
            cocoEval.params.imgIds = imgIds
            cocoEval.evaluate(id_setup)
            cocoEval.accumulate()
            performance_dict = cocoEval.summarize(id_setup)
            for key in performance_dict.keys():
                results[name + " " + key] = performance_dict[key]
    self._logger.info(
        "Evaluation results for Pedestrian Detection on CrowdHuman: \n"
        + create_small_table(results))
    ret_results["PedestrianDetection"] = copy.deepcopy(results)
    return ret_results
def evaluate_loss(self, cfg, model):
    """Compute and log the validation loss to Comet

    Args:
        cfg (CfgNode): Detectron Config Object
        model (torch.nn.Module): Detectron Model

    Returns:
        dict: Empty dict to satisfy Detectron Eval Hook API requirements
    """
    eval_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0],
                                              DatasetMapper(cfg, True))

    # Copying inference_on_dataset from evaluator.py
    total = len(eval_loader)
    num_warmup = min(5, total - 1)

    start_time = time.perf_counter()
    total_compute_time = 0
    losses = []

    if comm.is_main_process():
        storage = get_event_storage()
        for idx, inputs in enumerate(eval_loader):
            if idx == num_warmup:
                start_time = time.perf_counter()
                total_compute_time = 0
            start_compute_time = time.perf_counter()
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            total_compute_time += time.perf_counter() - start_compute_time
            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
            seconds_per_img = total_compute_time / iters_after_start
            if idx >= num_warmup * 2 or seconds_per_img > 5:
                total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start
                eta = datetime.timedelta(seconds=int(total_seconds_per_img * (total - idx - 1)))
                log_every_n_seconds(
                    logging.INFO,
                    "Loss on Validation done {}/{}. {:.4f} s / img. ETA={}".format(
                        idx + 1, total, seconds_per_img, str(eta)),
                    n=5,
                )
            loss_batch = self._get_loss(model, inputs)
            losses.append(loss_batch)
        mean_loss = np.mean(losses)

        # Log to Comet
        self.experiment.log_metric("eval_loss", mean_loss)
        storage.put_scalar("eval_loss", mean_loss)

    comm.synchronize()
    # Return an empty dict to satisfy the Detectron Eval Hook requirement
    return {}
def _do_loss_eval(self) -> float:
    """
    Evaluate the loss function on the validation set.

    Returns:
        mean_loss (float): Value of the loss.
    """
    # Copying inference_on_dataset from evaluator.py
    num_samples: int = len(self._data_loader)
    self._logger.info("Starting validation on %d samples", num_samples)
    num_warmup: int = min(5, num_samples - 1)

    start_time: float = time.perf_counter()
    total_compute_time: float = 0
    losses: List[float] = []
    for idx, inputs in enumerate(self._data_loader):
        if idx == num_warmup:
            start_time = time.perf_counter()
            total_compute_time = 0

        # Inference for these inputs
        start_compute_time: float = time.perf_counter()
        loss_batch: float = self._get_loss(inputs)
        losses.append(loss_batch)
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        total_compute_time += time.perf_counter() - start_compute_time

        iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
        seconds_per_img = total_compute_time / iters_after_start
        if idx >= num_warmup * 2 or seconds_per_img > 5:
            # Compute average time spent on each image.
            total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start
            # Compute ETA
            eta = datetime.timedelta(
                seconds=int(total_seconds_per_img * (num_samples - idx - 1)))
            log_every_n_seconds(lvl=logging.INFO,
                                msg=f"Loss on Validation done {idx + 1}/{num_samples}."
                                    f" {seconds_per_img:.4f} s / img. ETA={eta}",
                                n=100,
                                name=__name__)

    # Average the losses.
    mean_loss = np.mean(losses)
    # Log the loss value (note the f-string, which was missing originally).
    self._logger.info(f"Validation loss: {mean_loss}")
    # Store the loss value for it to be logged and displayed in TensorBoard.
    self.trainer.storage.put_scalar('validation_loss', mean_loss)
    comm.synchronize()

    return mean_loss
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DetectionCheckpointer(
        model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler
    )
    start_iter = (
        checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
    )
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(
        checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
    )

    writers = default_writers(cfg.OUTPUT_DIR, max_iter) if comm.is_main_process() else []

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement in a small training loop
    data_loader = build_detection_train_loader(cfg)
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            storage.iter = iteration

            loss_dict = model(data)
            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
            scheduler.step()

            if (
                cfg.TEST.EVAL_PERIOD > 0
                and (iteration + 1) % cfg.TEST.EVAL_PERIOD == 0
                and iteration != max_iter - 1
            ):
                do_test(cfg, model)
                # Compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()

            if iteration - start_iter > 5 and (
                (iteration + 1) % 20 == 0 or iteration == max_iter - 1
            ):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
def evaluate(self):
    if self._distributed:
        synchronize()
        self._predictions = all_gather(self._predictions)
        self._predictions = list(itertools.chain(*self._predictions))
        if not is_main_process():
            return

    return copy.deepcopy(self._eval_predictions())
def distributed_worker(local_rank, main_func, nprocs, dist_url, args):
    dist.init_process_group(backend="gloo",
                            init_method=dist_url,
                            world_size=nprocs,
                            rank=local_rank)
    comm.synchronize()

    assert comm._LOCAL_PROCESS_GROUP is None
    pg = dist.new_group(list(range(nprocs)))
    comm._LOCAL_PROCESS_GROUP = pg

    main_func(*args)