# Imports used below; if this section is part of a larger module, these most
# likely already appear at the top of the file.
import argparse
import datetime
import logging
import os
import time

import torch
from apex import amp  # NVIDIA apex mixed-precision utilities
from tqdm import tqdm

# Project-local helpers referenced below (SSD, build_dataset, Predictor,
# distributed_util, evaluate, reduce_loss_dict, setup_logger, str2bool, cfg,
# train, save_training_checkpoint, _save_model, _log_images_tensorboard,
# _compute_avg_map) are assumed to be imported or defined elsewhere in this
# module.


def do_evaluation(cfg, model, output_dir, distributed, datasets_dict=None):
    """Evaluate ``model`` on every test dataset listed in the config.

    ``datasets_dict`` optionally maps dataset names to already-built Dataset
    objects; when given, it is used instead of rebuilding the test datasets
    from ``cfg`` (this matches the ``datasets_dict=val_sets_dict`` call made
    from ``do_train``). Returns a dict mapping dataset names to evaluation
    results on the main process.
    """
    if isinstance(model, torch.nn.parallel.DistributedDataParallel):
        model = model.module
    assert isinstance(model, SSD), 'Wrong module.'
    if datasets_dict is not None:
        dataset_names = list(datasets_dict.keys())
        test_datasets = list(datasets_dict.values())
    else:
        dataset_names = cfg.DATASETS.TEST
        test_datasets = build_dataset(dataset_list=cfg.DATASETS.TEST, is_test=True)
    device = torch.device(cfg.MODEL.DEVICE)
    model.eval()
    if not model.is_test:
        model.is_test = True
    predictor = Predictor(cfg=cfg,
                          model=model,
                          iou_threshold=cfg.TEST.NMS_THRESHOLD,
                          score_threshold=cfg.TEST.CONFIDENCE_THRESHOLD,
                          device=device)
    # Delegate the per-dataset work to _evaluation below rather than
    # duplicating its loop here.
    metrics = {}
    for dataset_name, test_dataset in zip(dataset_names, test_datasets):
        result = _evaluation(cfg, dataset_name, test_dataset, predictor,
                             distributed, output_dir)
        if result is not None:  # non-main ranks return None
            metrics[dataset_name] = result
    return metrics
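
# --- Usage sketch (illustrative, not part of the original file) --------------
# A minimal single-GPU way to drive do_evaluation from a separate script.
# `build_ssd_model` and the config/checkpoint paths are hypothetical
# placeholders, not names confirmed by this section:
#
#     cfg.merge_from_file("configs/ssd300_voc.yaml")   # hypothetical path
#     model = build_ssd_model(cfg)                      # hypothetical factory
#     model.load("outputs/ssd300_vgg_final.pth")        # hypothetical checkpoint
#     model.to(torch.device(cfg.MODEL.DEVICE))
#     metrics = do_evaluation(cfg, model, cfg.OUTPUT_DIR, distributed=False)
# ------------------------------------------------------------------------------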
def _evaluation(cfg, dataset_name, test_dataset, predictor, distributed, output_dir):
    """Evaluate the model on a single dataset.

    Args:
        cfg: config object.
        dataset_name: name of the dataset.
        test_dataset: Dataset object.
        predictor: Predictor object used to run prediction.
        distributed: whether evaluation runs in distributed mode.
        output_dir: path where prediction results are saved.

    Returns:
        The evaluation result on the main process; other ranks return None.
    """
    cpu_device = torch.device("cpu")
    logger = logging.getLogger("SSD.inference")
    logger.info("Evaluating {} dataset ({} images):".format(dataset_name, len(test_dataset)))
    indices = list(range(len(test_dataset)))
    if distributed:
        # Shard the images round-robin across ranks: rank r takes every
        # world_size-th index starting at r.
        indices = indices[distributed_util.get_rank()::distributed_util.get_world_size()]
    # Show the progress bar only on the main process.
    progress_bar = tqdm if distributed_util.is_main_process() else iter
    logger.info('Progress on {} 0:'.format(cfg.MODEL.DEVICE.upper()))
    predictions = {}
    for i in progress_bar(indices):
        image = test_dataset.get_image(i)
        output = predictor.predict(image)
        boxes, labels, scores = [o.to(cpu_device).numpy() for o in output]
        predictions[i] = (boxes, labels, scores)
    distributed_util.synchronize()
    predictions = _accumulate_predictions_from_multiple_gpus(predictions)
    if not distributed_util.is_main_process():
        return
    final_output_dir = os.path.join(output_dir, dataset_name)
    if not os.path.exists(final_output_dir):
        os.makedirs(final_output_dir)
    torch.save(predictions, os.path.join(final_output_dir, 'predictions.pth'))
    return evaluate(dataset=test_dataset, predictions=predictions, output_dir=final_output_dir)
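
# --- Illustration of the index sharding above (not part of the original) -----
# With 10 images and world_size=4, indices[rank::4] gives:
#   rank 0 -> [0, 4, 8]
#   rank 1 -> [1, 5, 9]
#   rank 2 -> [2, 6]
#   rank 3 -> [3, 7]
# Every image is predicted exactly once; the per-rank shards are merged back
# into a single list by _accumulate_predictions_from_multiple_gpus below.
# ------------------------------------------------------------------------------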
def _accumulate_predictions_from_multiple_gpus(predictions_per_gpu):
    all_predictions = distributed_util.all_gather(predictions_per_gpu)
    if not distributed_util.is_main_process():
        return
    # Merge the list of per-rank dicts into one dict keyed by image index.
    predictions = {}
    for p in all_predictions:
        predictions.update(p)
    # Convert the dict into a list ordered by image index.
    image_ids = list(sorted(predictions.keys()))
    if len(image_ids) != image_ids[-1] + 1:
        logger = logging.getLogger("SSD.inference")
        logger.warning(
            "Number of images that were gathered from multiple processes is not "
            "a contiguous set. Some images might be missing from the evaluation"
        )
    predictions = [predictions[i] for i in image_ids]
    return predictions
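
# --- Example of the merge above (illustrative) --------------------------------
# all_gather returns one predictions dict per rank, e.g. with 2 ranks:
#   rank 0: {0: (boxes, labels, scores), 2: (boxes, labels, scores)}
#   rank 1: {1: (boxes, labels, scores), 3: (boxes, labels, scores)}
# After update() and sorting the keys, the result is the list
#   [predictions[0], predictions[1], predictions[2], predictions[3]],
# i.e. one entry per image in dataset order. The contiguity check warns when
# some indices are missing (the highest index no longer equals len - 1).
# -------------------------------------------------------------------------------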
def do_train(cfg, model, data_loader, optimizer, scheduler, device, args, val_sets_dict=None):
    logger = logging.getLogger("SSD.trainer")
    logger.info("Start training")
    model.train()
    save_to_disk = distributed_util.get_rank() == 0
    if args.use_tensorboard and save_to_disk:
        import tensorboardX
        summary_writer = tensorboardX.SummaryWriter(log_dir=cfg.OUTPUT_DIR)
    else:
        summary_writer = None
    if cfg.DATASETS.DG:
        dataloaders = data_loader
        max_iter = len(data_loader[0])
        dataiters = [iter(dataloader) for dataloader in dataloaders]
    else:
        max_iter = len(data_loader)
        data_loader = iter(data_loader)
    start_training_time = time.time()
    trained_time = 0
    tic = time.time()
    end = time.time()
    if args.return_best:
        best_map = 0
    for iteration in range(scheduler.last_epoch, max_iter):
        if cfg.DATASETS.DG:
            # Domain generalization setting: build one batch by drawing
            # cfg.SOLVER.BATCH_SIZE images from each source loader.
            images = torch.ones(cfg.SOLVER.BATCH_SIZE * len(dataloaders), 3,
                                cfg.INPUT.IMAGE_SIZE, cfg.INPUT.IMAGE_SIZE)
            for j in range(len(dataloaders)):
                if cfg.MODEL.SELF_SUPERVISED:
                    d_images, d_boxes, d_labels, d_j_images, d_j_index, d_orig_boxes, d_orig_labels = next(dataiters[j])
                else:
                    d_images, d_boxes, d_labels, d_orig_boxes, d_orig_labels = next(dataiters[j])
                start_bs = cfg.SOLVER.BATCH_SIZE * j
                end_bs = start_bs + cfg.SOLVER.BATCH_SIZE
                images[start_bs:end_bs, :, :, :] = d_images
                if j == 0:
                    boxes = d_boxes
                    labels = d_labels
                    orig_boxes = d_orig_boxes
                    orig_labels = d_orig_labels
                    if cfg.MODEL.SELF_SUPERVISED:
                        j_images = d_j_images
                        j_index = d_j_index
                else:
                    boxes = torch.cat((boxes, d_boxes))
                    labels = torch.cat((labels, d_labels))
                    orig_boxes = torch.cat((orig_boxes, d_orig_boxes))
                    orig_labels = torch.cat((orig_labels, d_orig_labels))
                    if cfg.MODEL.SELF_SUPERVISED:
                        j_images = torch.cat((j_images, d_j_images))
                        j_index = torch.cat((j_index, d_j_index))
        else:
            if cfg.MODEL.SELF_SUPERVISED:
                images, boxes, labels, j_images, j_index, orig_boxes, orig_labels = next(data_loader)
            else:
                images, boxes, labels, orig_boxes, orig_labels = next(data_loader)
        # Incrementing `iteration` here is safe: `range` resets the loop
        # variable at the top of each pass; the +1 makes logging and
        # checkpointing 1-based.
        iteration = iteration + 1
        images = images.to(device)
        boxes = boxes.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        loss_dict = model(images, targets=(boxes, labels))
        # Reduce losses over all GPUs for logging purposes.
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss = sum(loss for loss in loss_dict.values())
        # loss.backward() becomes (apex amp):
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        if cfg.MODEL.SELF_SUPERVISED:
            j_images = j_images.to(device)
            j_index = j_index.to(device)
            loss_dict_j = model(j_images, targets=j_index, auxiliary_task=True)
            loss_dict_reduced_j = reduce_loss_dict(loss_dict_j)
            losses_reduced_j = sum(loss for loss in loss_dict_reduced_j.values())
            loss_j = sum(loss for loss in loss_dict_j.values())
            # Apply the reduction factor for the auxiliary (self-supervised) loss.
            loss_j = loss_j * cfg.MODEL.SELF_SUPERVISOR.WEIGHT
            # loss_j.backward() becomes (apex amp):
            with amp.scale_loss(loss_j, optimizer) as scaled_loss:
                scaled_loss.backward()
            # Append the auxiliary losses to the dictionary of losses.
            loss_dict.update(loss_dict_j)
            losses_reduced += losses_reduced_j
        optimizer.step()
        scheduler.step()
        trained_time += time.time() - end
        end = time.time()
        if iteration % args.log_step == 0:
            eta_seconds = int((trained_time / iteration) * (max_iter - iteration))
{}".format(iteration, optimizer.param_groups[0]['lr'], time.time() - tic, str(datetime.timedelta(seconds=eta_seconds))), "total_loss: {:.3f}".format(losses_reduced.item()) ] for loss_name, loss_item in loss_dict_reduced.items(): log_str.append("{}: {:.3f}".format(loss_name, loss_item.item())) log_str = ', '.join(log_str) logger.info(log_str) if summary_writer: global_step = iteration summary_writer.add_scalar('losses/total_loss', losses_reduced, global_step=global_step) for loss_name, loss_item in loss_dict_reduced.items(): summary_writer.add_scalar('losses/{}'.format(loss_name), loss_item, global_step=global_step) summary_writer.add_scalar('lr', optimizer.param_groups[0]['lr'], global_step=global_step) if cfg.MODEL.SELF_SUPERVISED: _log_images_tensorboard(cfg, global_step, images, orig_boxes, orig_labels, summary_writer, j_images=j_images) else: _log_images_tensorboard(cfg, global_step, images, orig_boxes, orig_labels, summary_writer) #for tag, value in model.named_parameters(): # tag = tag.replace('.', '/') # if 'ss_classifier' in tag: # print(tag, value) #_log_network_params(tf_writer, model, global_step) tic = time.time() if save_to_disk and iteration % args.save_step == 0: model_path = os.path.join(cfg.OUTPUT_DIR, "ssd{}_vgg_iteration_{:06d}.pth".format(cfg.INPUT.IMAGE_SIZE, iteration)) save_training_checkpoint(logger, model, scheduler, optimizer, model_path) # Do eval when training, to trace the mAP changes and see whether or not performance improved # if args.return_best = True the model returned should be the one that gave best performances on the val set if args.eval_step > 0 and iteration % args.eval_step == 0 and (not iteration == max_iter or args.return_best): dataset_metrics = do_evaluation(cfg, model, cfg.OUTPUT_DIR, distributed=args.distributed, datasets_dict=val_sets_dict) model.train() if args.distributed and not distributed_util.is_main_process(): continue avg_map = _compute_avg_map(dataset_metrics) if args.return_best: if avg_map > best_map: best_map = avg_map logger.info("With iteration {} passed the best! New best avg map: {:4f}".format(iteration, best_map)) model_path = os.path.join(cfg.OUTPUT_DIR, "ssd{}_vgg_best.pth".format(cfg.INPUT.IMAGE_SIZE)) _save_model(logger, model, model_path) else: logger.info("With iteration {} the best has not been reached. Best avg map: {:4f}, Current avg mAP: {:4f}".format(iteration, best_map, avg_map)) # logging if summary_writer: global_step = iteration summary_writer.add_scalar("val_avg_map", avg_map, global_step=global_step) for dataset_name, metrics in dataset_metrics.items(): for metric_name, metric_value in metrics.get_printable_metrics().items(): summary_writer.add_scalar('/'.join(['val', dataset_name, metric_name]), metric_value, global_step=global_step) if save_to_disk: model_path = os.path.join(cfg.OUTPUT_DIR, "ssd{}_vgg_final.pth".format(cfg.INPUT.IMAGE_SIZE)) _save_model(logger, model, model_path) # compute training time total_training_time = int(time.time() - start_training_time) total_time_str = str(datetime.timedelta(seconds=total_training_time)) logger.info("Total training time: {} ({:.4f} s / it)".format(total_time_str, total_training_time / max_iter)) if args.return_best: model.load(os.path.join(cfg.OUTPUT_DIR, "ssd{}_vgg_best.pth".format(cfg.INPUT.IMAGE_SIZE))) return model
def main():
    parser = argparse.ArgumentParser(
        description='Single Shot MultiBox Detector Training With PyTorch')
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        '--vgg',
        help='Pre-trained vgg model path, download from '
             'https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth')
    parser.add_argument(
        '--resume',
        default=None,
        type=str,
        help='Checkpoint state_dict file to resume training from')
    parser.add_argument('--log_step', default=50, type=int,
                        help='Print logs every log_step iterations')
    parser.add_argument('--save_step', default=5000, type=int,
                        help='Save a checkpoint every save_step iterations')
    parser.add_argument(
        '--eval_step',
        default=0,
        type=int,
        help='Evaluate every eval_step iterations; disabled when eval_step <= 0. Default: disabled')
    parser.add_argument('--use_tensorboard', default=True, type=str2bool)
    parser.add_argument("--num_workers", default=4, type=int,
                        help="Number of workers to use for data loaders")
    parser.add_argument(
        "--eval_mode",
        default="test",
        type=str,
        help='Use the defined test datasets for periodic evaluation or use a validation split. '
             'Default: "test", alternative: "val"')
    parser.add_argument(
        "--return_best",
        default=False,
        type=str2bool,
        help="If False (default), test the last model on the target. If True, test the model "
             "that performed best on the validation set")
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()
    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1
    args.num_gpus = num_gpus
    if torch.cuda.is_available():
        # Enable the built-in cudnn auto-tuner, which finds the best
        # convolution algorithms for the hardware in use.
        torch.backends.cudnn.benchmark = True
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl", init_method="env://")
    logger = setup_logger("SSD", distributed_util.get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()
    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))
    if not os.path.exists(cfg.OUTPUT_DIR):
        if not args.distributed or distributed_util.is_main_process():
            os.makedirs(cfg.OUTPUT_DIR)
    model = train(cfg, args)
    if not args.skip_test:
        logger.info('Start evaluating...')
        torch.cuda.empty_cache()  # speed up evaluation after training finished
        do_evaluation(cfg, model, cfg.OUTPUT_DIR, distributed=args.distributed)
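
# --- Launch examples (illustrative) -------------------------------------------
# The env:// init method and the --local_rank argument above follow the
# torch.distributed.launch convention, so (assuming the usual
# `if __name__ == '__main__': main()` guard closes this file, and taking the
# script and config names as hypothetical placeholders) training can be
# started with:
#
#   single GPU:
#     python train.py --config-file configs/ssd300_voc.yaml
#
#   4 GPUs on one node:
#     python -m torch.distributed.launch --nproc_per_node=4 train.py \
#         --config-file configs/ssd300_voc.yaml
# -------------------------------------------------------------------------------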