def reduce_loss_dict(loss_dict):
    """
    Reduce the loss dictionary from all processes so that process with rank 0
    has the averaged results. Returns a dict with the same fields as
    loss_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        loss_names = []
        all_losses = []
        for k, v in loss_dict.items():
            loss_names.append(k)
            all_losses.append(v)
        all_losses = torch.stack(all_losses, dim=0)
        dist.reduce(all_losses, dst=0)
        if dist.get_rank() == 0:
            # only main process gets accumulated, so only divide by
            # world_size in this case
            all_losses /= world_size
        reduced_losses = {k: v for k, v in zip(loss_names, all_losses)}
    return reduced_losses
def reduce_loss_dict(loss_dict):
    """
    Reduce the loss dictionary from all processes so that process with rank 0
    has the averaged results. Returns a dict with the same fields as
    loss_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        loss_names = []
        all_losses = []
        # iterate in sorted key order so every rank stacks the losses identically
        for k in sorted(loss_dict.keys()):
            loss_names.append(k)
            all_losses.append(loss_dict[k])
        all_losses = torch.stack(all_losses, dim=0)
        dist.reduce(all_losses, dst=0)
        if dist.get_rank() == 0:
            # only main process gets accumulated, so only divide by
            # world_size in this case
            all_losses /= world_size
        reduced_losses = {k: v for k, v in zip(loss_names, all_losses)}
    return reduced_losses
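# reduce_loss_dict above relies on the project's distributed helpers. A minimal
# sketch of get_world_size / get_rank as they are typically implemented on top
# of torch.distributed (a sketch, not the project's exact utility file):
import torch.distributed as dist


def get_world_size():
    # Fall back to 1 when no process group is initialized, so single-GPU runs
    # take the early-return path in reduce_loss_dict.
    if not dist.is_available() or not dist.is_initialized():
        return 1
    return dist.get_world_size()


def get_rank():
    # Rank 0 is the process that receives the reduced (summed) losses.
    if not dist.is_available() or not dist.is_initialized():
        return 0
    return dist.get_rank()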
def run_test(cfg, model, distributed=False, test_mode="test", val_sets_dict=None): synchronize() model.eval() if distributed: model_orig = model model = model.module iou_types = ("bbox", ) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm", ) if cfg.MODEL.KEYPOINT_ON: iou_types = iou_types + ("keypoints", ) if test_mode == "test": dataset_names = cfg.DATASETS.TEST data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) else: dataset_names = val_sets_dict.keys() data_loaders_val = [] # create data loaders for validation datasets num_gpus = get_world_size() images_per_batch = cfg.TEST.IMS_PER_BATCH assert ( images_per_batch % num_gpus == 0 ), "TEST.IMS_PER_BATCH ({}) must be divisible by the number of GPUs ({}) used.".format( images_per_batch, num_gpus) shuffle = False if not distributed else True images_per_gpu = images_per_batch // num_gpus num_iters = None start_iter = 0 aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else [] val_transforms = None if cfg.TEST.BBOX_AUG.ENABLED else build_transforms( cfg, False) for k, ds in val_sets_dict.items(): ds.set_keep_difficult(True) ds.set_transforms(val_transforms) sampler = make_data_sampler(ds, shuffle, distributed) batch_sampler = make_batch_data_sampler(ds, sampler, aspect_grouping, images_per_gpu, num_iters, start_iter) collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY) num_workers = cfg.DATALOADER.NUM_WORKERS data_loader = torch.utils.data.DataLoader( ds, num_workers=num_workers, batch_sampler=batch_sampler, collate_fn=collator, ) data_loaders_val.append(data_loader) sum_mAPs = 0 for dataset_name, data_loader_val in zip(dataset_names, data_loaders_val): results = inference( model, data_loader_val, dataset_name=dataset_name, iou_types=iou_types, box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, cfg=cfg, ) synchronize() if distributed and not dist.get_rank() == 0: continue sum_mAPs += results["map"] if distributed: model = model_orig model.train() if test_mode == "val": train_transforms = build_transforms(cfg, True) for k, ds in val_sets_dict.items(): ds.set_keep_difficult(False) ds.set_transforms(train_transforms) return sum_mAPs / len(dataset_names)
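# run_test calls synchronize() so that all ranks enter evaluation together and
# only rank 0 accumulates the mAP sum. A minimal sketch of such a barrier
# helper (assumption: it simply wraps torch.distributed.barrier when a process
# group exists, and is a no-op otherwise):
import torch.distributed as dist


def synchronize():
    # No-op for single-process runs; otherwise block until every rank arrives.
    if not dist.is_available() or not dist.is_initialized():
        return
    if dist.get_world_size() == 1:
        return
    dist.barrier()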
def do_train( cfg, model, data_loader, data_loader_val, optimizer, scheduler, checkpointer, device, checkpoint_period, test_period, arguments, ): logger = logging.getLogger("maskrcnn_benchmark.trainer") logger.info("Start training") meters = MetricLogger(delimiter=" ") max_iter = len(data_loader) start_iter = arguments["iteration"] model.train() start_training_time = time.time() end = time.time() iou_types = ("bbox", ) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm", ) if cfg.MODEL.KEYPOINT_ON: iou_types = iou_types + ("keypoints", ) dataset_names = cfg.DATASETS.TEST for iteration, (images, targets, _) in enumerate(data_loader, start_iter): if any(len(target) < 1 for target in targets): logger.error( f"Iteration={iteration + 1} || Image Ids used for training {_} || targets Length={[len(target) for target in targets]}" ) continue data_time = time.time() - end iteration = iteration + 1 arguments["iteration"] = iteration images = images.to(device) targets = [target.to(device) for target in targets] loss_dict = model(images, targets) losses = sum(loss for loss in loss_dict.values()) # reduce losses over all GPUs for logging purposes loss_dict_reduced = reduce_loss_dict(loss_dict) losses_reduced = sum(loss for loss in loss_dict_reduced.values()) meters.update(loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad() # Note: If mixed precision is not used, this ends up doing nothing # Otherwise apply loss scaling for mixed-precision recipe # with amp.scale_loss(losses, optimizer) as scaled_losses: # scaled_losses.backward() losses.backward() optimizer.step() scheduler.step() batch_time = time.time() - end end = time.time() meters.update(time=batch_time, data=data_time) eta_seconds = meters.time.global_avg * (max_iter - iteration) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) if iteration % 20 == 0 or iteration == max_iter: logger.info( meters.delimiter.join([ "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}", "max mem: {memory:.0f}", ]).format( eta=eta_string, iter=iteration, meters=str(meters), lr=optimizer.param_groups[0]["lr"], memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, )) if iteration % checkpoint_period == 0: checkpointer.save("model_{:07d}".format(iteration), **arguments) if data_loader_val is not None and test_period > 0 and iteration % test_period == 0: meters_val = MetricLogger(delimiter=" ") synchronize() _ = inference( # The result can be used for additional logging, e. g. 
for TensorBoard model, # The method changes the segmentation mask format in a data loader, # so every time a new data loader is created: make_data_loader(cfg, is_train=False, is_distributed=(get_world_size() > 1), is_for_period=True), dataset_name="[Validation]", iou_types=iou_types, box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=None, ) synchronize() model.train() with torch.no_grad(): # Should be one image for each GPU: for iteration_val, (images_val, targets_val, _) in enumerate(tqdm(data_loader_val)): images_val = images_val.to(device) targets_val = [target.to(device) for target in targets_val] loss_dict = model(images_val, targets_val) losses = sum(loss for loss in loss_dict.values()) loss_dict_reduced = reduce_loss_dict(loss_dict) losses_reduced = sum( loss for loss in loss_dict_reduced.values()) meters_val.update(loss=losses_reduced, **loss_dict_reduced) synchronize() logger.info( meters_val.delimiter.join([ "[Validation]: ", "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}", "max mem: {memory:.0f}", ]).format( eta=eta_string, iter=iteration, meters=str(meters_val), lr=optimizer.param_groups[0]["lr"], memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, )) if iteration == max_iter: checkpointer.save("model_final", **arguments) total_training_time = time.time() - start_training_time total_time_str = str(datetime.timedelta(seconds=total_training_time)) logger.info("Total training time: {} ({:.4f} s / it)".format( total_time_str, total_training_time / (max_iter)))
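# The training loop above depends on a MetricLogger that exposes per-key
# smoothed values and attribute access such as meters.time.global_avg. A
# simplified sketch of that interface (the project's own logger is richer;
# this only shows the parts the loop uses):
from collections import defaultdict, deque
import torch


class SmoothedValue:
    def __init__(self, window_size=20):
        self.deque = deque(maxlen=window_size)
        self.total = 0.0
        self.count = 0

    def update(self, value):
        self.deque.append(value)
        self.count += 1
        self.total += value

    @property
    def median(self):
        return torch.tensor(list(self.deque)).median().item()

    @property
    def global_avg(self):
        return self.total / self.count


class MetricLogger:
    def __init__(self, delimiter="  "):
        self.meters = defaultdict(SmoothedValue)
        self.delimiter = delimiter

    def update(self, **kwargs):
        for k, v in kwargs.items():
            if isinstance(v, torch.Tensor):
                v = v.item()
            self.meters[k].update(float(v))

    def __getattr__(self, attr):
        # lets the loop write meters.time.global_avg for the ETA estimate
        if attr in self.meters:
            return self.meters[attr]
        raise AttributeError(attr)

    def __str__(self):
        return self.delimiter.join(
            "{}: {:.4f} ({:.4f})".format(name, m.median, m.global_avg)
            for name, m in self.meters.items())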
def __init__(self, dataset, shuffle=True, distributed=False, num_replicas=None,
             rank=None, args=None, cfg=None):
    self.dataset = dataset
    # this is a list of list of names.
    # the first level corresponds to a video, and the second
    # one to the names of the frames in the video
    self.video_data = dataset.video_data
    self.window_size = 1
    self.batch_size_per_gpu = 1
    self.epoch = 0
    self.shuffle = shuffle
    self.distributed = distributed
    self.is_train = True
    if args is not None:
        if hasattr(args, 'window_size'):
            self.window_size = args.window_size
        if hasattr(args, 'is_train'):
            self.is_train = args.is_train
    if cfg is not None:
        self.batch_size_per_gpu = get_batch_size_per_gpu(
            cfg.SOLVER.IMS_PER_BATCH if self.is_train else cfg.TEST.IMS_PER_BATCH)
    self.indices = []
    for video_id in sorted(self.video_data):
        frame_list = sorted(self.video_data[video_id])
        count = 0
        frame_ids = []
        for frame_id in sorted(frame_list):
            frame_ids.append(frame_id)
            count += 1
            if count == self.window_size:
                self.indices.append(frame_ids)
                frame_ids = []
                count = 0
        # at test time, pad a trailing partial window by repeating the last
        # frame so no frames are dropped (use self.is_train here rather than
        # args.is_train, which would raise when args is None)
        if not self.is_train and count > 0:
            for i in range(self.window_size):
                frame_ids.append(frame_id)
                count += 1
                if count == self.window_size:
                    self.indices.append(frame_ids)
                    frame_ids = []
                    count = 0
                    break
    self.num_samples = len(self.indices)
    self.total_size = self.num_samples
    # print(self.__len__())
    if self.distributed:
        if num_replicas is None:
            num_replicas = get_world_size()
        if rank is None:
            rank = get_rank()
        self.num_replicas = num_replicas
        self.rank = rank
        self.num_samples = int(
            math.ceil(self.num_samples * 1.0 / self.num_replicas))
        self.total_size = self.num_samples * self.num_replicas
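# The __init__ above only builds self.indices (one entry per frame window) and
# the per-rank sample counts. A sketch of the __iter__ / __len__ methods such a
# distributed window sampler typically pairs with (an assumption: round-robin
# slicing by rank with padding so every rank sees the same number of windows;
# torch and math are assumed imported as in the rest of the file):
    def __iter__(self):
        if self.shuffle:
            g = torch.Generator()
            g.manual_seed(self.epoch)
            order = torch.randperm(len(self.indices), generator=g).tolist()
        else:
            order = list(range(len(self.indices)))
        if self.distributed:
            # pad so the list divides evenly, then take this rank's slice
            order += order[: self.total_size - len(order)]
            order = order[self.rank: self.total_size: self.num_replicas]
        for i in order:
            yield self.indices[i]

    def __len__(self):
        return self.num_samples

    def set_epoch(self, epoch):
        # lets the training loop reshuffle deterministically each epoch
        self.epoch = epoch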
def do_train( cfg, model, data_loader_support, data_loader_query, data_loader_val_support, data_loader_val_test, optimizer, scheduler, checkpointer, device, checkpoint_period, test_period, arguments, meters, meters_val, ): logger = logging.getLogger("maskrcnn_benchmark.trainer") logger.info("Start training") # meters = MetricLogger(delimiter=" ") max_iter = len(data_loader_support) start_iter = arguments["iteration"] model.train() start_training_time = time.time() end = time.time() batch_cls_json_file = cfg.MODEL.FEW_SHOT.SUP_INDICE_CLS with open(batch_cls_json_file, 'r') as f: batch_cls_sup = json.load(f) if cfg.MODEL.QRY_BALANCE: qry_cls_json_file = cfg.MODEL.QRY_INDICE_CLS with open(qry_cls_json_file, 'r') as f: batch_cls_qry = json.load(f) iou_types = ("bbox",) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm",) if cfg.MODEL.KEYPOINT_ON: iou_types = iou_types + ("keypoints",) rank = dist.get_rank() # if is_main_process(): # import pdb # pdb.set_trace() # else: # return # for name, param in model. named_parameters(): # print(name, param, True if param.grad is not None else False) query_iterator = data_loader_query.__iter__() # print('len(data_loader_query):', len(data_loader_query)) # import pdb; pdb.set_trace() weights_novel_all = [] iteration_qry = 0 for iteration, (images_sup, targets_sup, idx) in enumerate(data_loader_support, start_iter): if any(len(target) < 1 for target in targets_sup): logger.error(f"Iteration={iteration + 1} || Image Ids used for training support {idx} || targets Length={[len(target) for target in targets_sup]}") continue data_time = time.time() - end batch_id = batch_cls_sup[rank][iteration] iteration = iteration + 1 arguments["iteration"] = iteration scheduler.step() images_sup = images_sup.to(device) targets_sup = [target.to(device) for target in targets_sup] # update weight: # print(targets_sup) # if is_main_process(): # import pdb # pdb.set_trace() # else: # return # print(iteration, idx, batch_id, targets_sup[0].extra_fields) weight_novel = model(images_sup, targets_sup, is_support=True, batch_id=batch_id) # weights_novel[rank] = weight_novel # print('batch_id', batch_id, weight_novel[:10]) # weight_novel = {batch_id:weight_novel} torch.cuda.empty_cache() # synchronize() weights_novel = [torch.empty_like(weight_novel) for i in range(dist.get_world_size())] weights_novel = torch.cat( diffdist.functional.all_gather(weights_novel, weight_novel)) # print(weights_novel[:,:10]) # if is_main_process(): # import pdb # pdb.set_trace() # else: # return weights_novel_all.append(weights_novel) # # print(weights_novel_all) # print(torch.cat(weights_novel_all).size()) # print(torch.cat(weights_novel_all)[:,:10]) # (torch.cat(gather_list) * torch.cat(gather_list)).mean().backward() # print(weights_novel) if iteration % iter_size == 0: optimizer.zero_grad() losses_reduced = 0 loss_dict_all = {} for i in range(iter_size_qry): images_qry, targets_qry, idx = query_iterator.next() images_qry = images_qry.to(device) targets_qry = [target.to(device) for target in targets_qry] if cfg.MODEL.QRY_BALANCE: batch_id_qry = batch_cls_qry[rank][iteration_qry] iteration_qry += 1 loss_dict = model(images_qry, targets_qry, is_query=True, batch_id=batch_id_qry, weights_novel=torch.cat(weights_novel_all)) else: loss_dict = model(images_qry, targets_qry, is_query=True, weights_novel=torch.cat(weights_novel_all)) # if is_main_process(): # print('loss_dict', loss_dict) losses = sum(loss for loss in loss_dict.values() ) / iter_size_qry # losses.backward(retain_graph=True) with 
amp.scale_loss(losses, optimizer) as scaled_losses: scaled_losses.backward(retain_graph=True) torch.cuda.empty_cache() loss_dict_all = add_dict(loss_dict_all, loss_dict) loss_dict_all = avg_dict(loss_dict_all) # if is_main_process(): # print('loss_dict_all', loss_dict_all) # reduce losses over all GPUs for logging purposes loss_dict_reduced = reduce_loss_dict(loss_dict_all) # if is_main_process(): # print('loss_dict_reduced', loss_dict_reduced) losses_reduced = sum(loss for loss in loss_dict_reduced.values()) # losses_dict_reduced = add_dict(losses_dict_reduced, loss_dict_reduced) meters.update(iteration / iter_size_qry, loss=losses_reduced, lr=optimizer.param_groups[0]["lr"], **loss_dict_reduced) weights_novel_all = [] # (weights_novel * weights_novel).mean().backward() # for name, param in model. named_parameters(): # if 'backbone' not in name: # print(name, True if param.grad is not None else False) optimizer.step() batch_time = time.time() - end end = time.time() meters.update(iteration, time=batch_time, data=data_time) eta_seconds = meters.time.global_avg * (max_iter - iteration) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) torch.cuda.empty_cache() if iteration % 20 == 0 or iteration == max_iter: logger.info( meters.delimiter.join( [ "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}", "max mem: {memory:.0f}", ] ).format( eta=eta_string, iter=iteration, meters=str(meters), lr=optimizer.param_groups[0]["lr"], memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, ) ) if iteration % checkpoint_period == 0: checkpointer.save("model_{:07d}".format(iteration), **arguments) if data_loader_val_support is not None and test_period > 0 and iteration % test_period == 0: # meters_val = MetricLogger(delimiter=" ") synchronize() # """ model.train() with torch.no_grad(): weights_novel_val_sup_all = [] current_classifier_novel = torch.zeros( [iter_size * nGPU, 1024]).to(device) # print(current_classifier_novel) avg_steps = 0 for iteration_val_sup, (images_val_sup, targets_val_sup, idx_val_sup) in enumerate(tqdm(data_loader_val_support)): if any(len(target) < 1 for target in targets_val_sup): logger.error(f"Iteration={iteration + 1} || Image Ids used for training support {idx_val_sup} || targets Length={[len(target) for target in targets_val_sup]}") continue batch_id_val_sup = batch_cls_sup[rank][int( iteration_val_sup)] # print(iteration_val_sup) images_val_sup = images_val_sup.to(device) targets_val_sup = [target.to(device) for target in targets_val_sup] weight_novel_val_sup = model(images_val_sup, targets_val_sup, is_support=True, batch_id=batch_id_val_sup) # weights_novel[rank] = weight_novel_val_sup # print(weight_novel_val_sup.size()) # print('before', weight_novel_val_sup) # print('batch_id', batch_id, weight_novel_val_sup[:10]) # weight_novel_val_sup = {batch_id:weight_novel_val_sup} torch.cuda.empty_cache() # synchronize() weights_novel_val_sup = [torch.empty_like(weight_novel_val_sup) for i in range(dist.get_world_size())] dist.all_gather(weights_novel_val_sup, weight_novel_val_sup) # weights_novel_val_sup = torch.cat( # all_gather(weight_novel_val_sup)) # print('after', weights_novel_val_sup) # print(idx, weights_novel_val_sup) # print(weights_novel_val_sup[:,:10]) # if is_main_process(): # import pdb # pdb.set_trace() # else: # return weights_novel_val_sup_all.append( torch.cat(weights_novel_val_sup)) # print('length', len(weights_novel_val_sup_all)) if (iteration_val_sup + 1) % iter_size_qry == 0: # print(torch.cat(weights_novel_val_sup_all).size()) # 
weights_novel_val_sup_all = [] avg_steps += 1 # print('current_classifier_novel', current_classifier_novel) # print('weights_novel_val_sup_all', weights_novel_val_sup_all) current_classifier_novel = current_classifier_novel + \ torch.cat(weights_novel_val_sup_all) weights_novel_val_sup_all = [] # if is_main_process(): # import pdb # pdb.set_trace() # else: # return # print(iteration_val_sup) current_classifier_novel_avg = current_classifier_novel / avg_steps model.module.roi_heads.box.cls_weights = torch.cat([model.module.roi_heads.box.predictor.cls_score.weight, current_classifier_novel_avg]) # """ output_folder = os.path.join(cfg.OUTPUT_DIR, "Validation") mkdir(output_folder) np.save(os.path.join(output_folder, 'cls_weights_'+str(iteration / iter_size_qry)), np.array(model.module.roi_heads.box.cls_weights.cpu().data)) res_infer = inference( # The result can be used for additional logging, e. g. for TensorBoard model, iteration / iter_size, # The method changes the segmentation mask format in a data loader, # so every time a new data loader is created: make_data_loader(cfg, is_train=False, is_distributed=( get_world_size() > 1), is_for_period=True), dataset_name="[Validation]", iou_types=iou_types, box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, ) # import pdb; pdb.set_trace() if res_infer: meters_val.update(iteration / iter_size, **res_infer) synchronize() # print('eval') model.train() """ with torch.no_grad(): # Should be one image for each GPU: for iteration_val, (images_val, targets_val, _) in enumerate(tqdm(data_loader_val_test)): images_val = images_val.to(device) targets_val = [target.to(device) for target in targets_val] loss_dict = model(images_val, targets_val) losses = sum(loss for loss in loss_dict.values()) loss_dict_reduced = reduce_loss_dict(loss_dict) losses_reduced = sum( loss for loss in loss_dict_reduced.values()) meters_val.update( iteration / iter_size, loss=losses_reduced, **loss_dict_reduced) """ synchronize() logger.info( meters_val.delimiter.join( [ "[Validation]: ", "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}", "max mem: {memory:.0f}", ] ).format( eta=eta_string, iter=iteration / iter_size, meters=str(meters_val), lr=optimizer.param_groups[0]["lr"], memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, ) ) # """ if iteration == max_iter: checkpointer.save("model_final", **arguments) # import json # json.dump(model.module.roi_heads.box.cls_weights, open(os.path.join(output_folder, 'cls_weights.json'), 'w')) total_training_time = time.time() - start_training_time total_time_str = str(datetime.timedelta(seconds=total_training_time)) logger.info( "Total training time: {} ({:.4f} s / it)".format( total_time_str, total_training_time / (max_iter) ) )
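# add_dict / avg_dict are called in the query loop above to accumulate the
# per-step loss dicts and turn the sum back into a mean, but they are not
# defined in this file. A minimal sketch consistent with those call sites
# (assumption: the accumulator carries its own step count under a reserved
# key so avg_dict can be called with the dict alone, as it is above):
def add_dict(acc, new):
    out = dict(acc)
    out['_steps'] = out.get('_steps', 0) + 1
    for k, v in new.items():
        out[k] = out[k] + v if k in out else v
    return out


def avg_dict(acc):
    n = max(acc.pop('_steps', 1), 1)
    return {k: v / n for k, v in acc.items()}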
def make_cls_data_loader(cfg, is_train=True, domains=['clean'], is_distributed=False, start_iter=0): #if 'clean' in domains: # assert (len(domains)==1) num_gpus = get_world_size() if is_train: images_per_batch = cfg.SOLVER.IMS_PER_BATCH assert (images_per_batch % num_gpus == 0 ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number " "of GPUs ({}) used.".format(images_per_batch, num_gpus) images_per_gpu = images_per_batch // num_gpus if cfg.MODEL.DOMAIN_ADAPTATION_ON: assert ( images_per_batch % ((NUM_TARGET_DOMAINS + 1) * num_gpus) == 0 ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by {} times the number " "of GPUs ({}) used.".format(images_per_batch, NUM_TARGET_DOMAINS + 1, num_gpus) images_per_gpu = images_per_batch // ( (NUM_TARGET_DOMAINS + 1) * num_gpus) shuffle = True num_iters = cfg.SOLVER.MAX_ITER else: images_per_batch = cfg.TEST.IMS_PER_BATCH assert (images_per_batch % num_gpus == 0 ), "TEST.IMS_PER_BATCH ({}) must be divisible by the number " "of GPUs ({}) used.".format(images_per_batch, num_gpus) images_per_gpu = images_per_batch // num_gpus shuffle = False if not is_distributed else True num_iters = None start_iter = 0 if images_per_gpu > 1: logger = logging.getLogger(__name__) logger.warning( "When using more than one image per GPU you may encounter " "an out-of-memory (OOM) error if your GPU does not have " "sufficient memory. If this happens, you can reduce " "SOLVER.IMS_PER_BATCH (for training) or " "TEST.IMS_PER_BATCH (for inference). For training, you must " "also adjust the learning rate and schedule length according " "to the linear scaling rule. See for example: " "https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14" ) # group images which have similar aspect ratio. In this case, we only # group in two cases: those with width / height > 1, and the other way around, # but the code supports more general grouping strategy aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else [] paths_catalog = import_file("maskrcnn_benchmark.config.paths_catalog", cfg.PATHS_CATALOG, True) DatasetCatalog = paths_catalog.DomainDatasetCatalog if is_train: if cfg.MODEL.DOMAIN_ADAPTATION_ON: dataset_list = [] for i, domain in enumerate(domains): if domain == 'clean': dataset_list.append(cfg.DATASETS.SOURCE_TRAIN) elif domain == 'foggy': dataset_list.append(cfg.DATASETS.FOGGY_TRAIN) elif domain == 'snowy': dataset_list.append(cfg.DATASETS.SNOWY_TRAIN) else: dataset_list = None raise NotImplementedError("Unknown domain") else: dataset_list = cfg.DATASETS.TRAIN else: dataset_list = cfg.DATASETS.TEST transforms = build_transforms(cfg, is_train) datasets = build_cls_dataset(dataset_list, transforms, DatasetCatalog, is_train, domains) data_loaders = [] for dataset in datasets: sampler = make_data_sampler(dataset, shuffle, is_distributed) batch_sampler = make_batch_data_sampler(dataset, sampler, aspect_grouping, images_per_gpu, num_iters, start_iter) collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY) num_workers = cfg.DATALOADER.NUM_WORKERS data_loader = torch.utils.data.DataLoader( dataset, num_workers=num_workers, batch_sampler=batch_sampler, collate_fn=collator, ) data_loaders.append(data_loader) #if is_train: # # during training, a single (possibly concatenated) data_loader is returned # assert len(data_loaders) == 1 # return data_loaders[0] return data_loaders
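# A sketch of how make_cls_data_loader above might be driven when domain
# adaptation is enabled. The domain names and cfg fields are the ones the
# function itself checks; the wrapper name build_domain_loaders and the choice
# of target domain are assumptions for illustration:
def build_domain_loaders(cfg, distributed):
    domains = ['clean']
    if cfg.MODEL.DOMAIN_ADAPTATION_ON:
        # one source domain plus NUM_TARGET_DOMAINS target domains
        domains = ['clean', 'foggy']
    return make_cls_data_loader(cfg,
                                is_train=True,
                                domains=domains,
                                is_distributed=distributed)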
def do_train( cfg, total_model, data_loader, data_loader_val, optimizer, scheduler, checkpointer, device, checkpoint_period, test_period, arguments, args, ): if len(total_model) > 1: model = total_model[1] t_model = total_model[0] else: model = total_model[0] logger = logging.getLogger("maskrcnn_benchmark.trainer") logger.info("Start training") meters = MetricLogger(delimiter=" ") max_iter = len(data_loader) start_iter = arguments["iteration"] model.train() start_training_time = time.time() end = time.time() iou_types = ("bbox", ) if cfg[0].MODEL.MASK_ON: iou_types = iou_types + ("segm", ) if cfg[0].MODEL.KEYPOINT_ON: iou_types = iou_types + ("keypoints", ) dataset_names = cfg[0].DATASETS.TEST pytorch_1_1_0_or_later = is_pytorch_1_1_0_or_later() for iteration, (images, targets, _) in enumerate(data_loader, start_iter): data_time = time.time() - end iteration = iteration + 1 arguments["iteration"] = iteration # in pytorch >= 1.1.0, scheduler.step() should be run after optimizer.step() if not pytorch_1_1_0_or_later: scheduler.step() images = images.to(device) targets = [target.to(device) for target in targets] loss_dict, features_dict = model(images, targets) if len(total_model) > 1: with torch.no_grad(): t_loss_dict, t_features_dict = t_model(images, targets) # with torch.no_grad(): # # teacher_model = t_model # t_weight = torch.load('./weights/centermask-V-19-eSE-FPN-ms-3x.pth') # t_weight = t_weight['model'] # new_tweight = OrderedDict() # for k, v in t_weight.items(): # name = k[7:] # remove `module.` # new_tweight[name] = v # t_model.load_state_dict(new_tweight) # t_loss_dict, t_features_dict = t_model(images, targets) if args.loss_head: loss_regression = new_box_loss(t_loss_dict['loss_reg'], loss_dict['loss_reg']) loss_center = new_center_loss(t_loss_dict['loss_centerness'], loss_dict['loss_centerness']) mode = 'KL' # mode = 'KL' or 'cross-entropy' loss_pixel_wise = pixel_wise_loss(features_dict['box_cls'], t_features_dict['box_cls'], mode) loss_head = (loss_regression + loss_center + loss_pixel_wise) loss_dict.setdefault('loss_head', loss_head) del loss_dict['loss_reg'] del loss_dict['loss_centerness'] if iteration > cfg[0].SOLVER.WARMUP_ITERS: if args.loss_correlation: correlation = True loss_corr = get_feature(t_model, model, images, targets, correlation) loss_dict.setdefault('loss_corr', loss_corr) if args.loss_featuremap: correlation = False loss_featuremap = get_feature(t_model, model, images, targets, correlation) loss_dict.setdefault('loss_featuremap', loss_featuremap) losses = sum(loss for loss in loss_dict.values()) # reduce losses over all GPUs for logging purposes loss_dict_reduced = reduce_loss_dict(loss_dict) losses_reduced = sum(loss for loss in loss_dict_reduced.values()) meters.update(loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad() losses.backward() optimizer.step() if pytorch_1_1_0_or_later: scheduler.step() batch_time = time.time() - end end = time.time() meters.update(time=batch_time, data=data_time) eta_seconds = meters.time.global_avg * (max_iter - iteration) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) if iteration % 20 == 0 or iteration == max_iter: logger.info( meters.delimiter.join([ "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}", "max mem: {memory:.0f}", ]).format( eta=eta_string, iter=iteration, meters=str(meters), lr=optimizer.param_groups[0]["lr"], memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, )) if iteration % checkpoint_period == 0: checkpointer.save("model_{:07d}".format(iteration), **arguments) 
if data_loader_val is not None and test_period > 0 and iteration % test_period == 0 and iteration != 0: meters_val = MetricLogger(delimiter=" ") synchronize() _ = inference( # The result can be used for additional logging, e. g. for TensorBoard model, # The method changes the segmentation mask format in a data loader, # so every time a new data loader is created: make_data_loader(cfg[0], is_train=False, is_distributed=(get_world_size() > 1), is_for_period=True), dataset_name="[Validation]", iou_types=iou_types, box_only=False if cfg[0].MODEL.MASK_ON else cfg[0].MODEL.RPN_ONLY, device=cfg[0].MODEL.DEVICE, expected_results=cfg[0].TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg[0].TEST. EXPECTED_RESULTS_SIGMA_TOL, output_folder=None, ) synchronize() model.train() with torch.no_grad(): # Should be one image for each GPU: for iteration_val, (images_val, targets_val, _) in enumerate(tqdm(data_loader_val)): images_val = images_val.to(device) targets_val = [target.to(device) for target in targets_val] loss_dict = model(images_val, targets_val) if len(loss_dict) > 1: loss_dict = loss_dict[0] else: loss_dict = loss_dict losses = sum(loss for loss in loss_dict.values()) loss_dict_reduced = reduce_loss_dict(loss_dict) losses_reduced = sum( loss for loss in loss_dict_reduced.values()) meters_val.update(loss=losses_reduced, **loss_dict_reduced) synchronize() logger.info( meters_val.delimiter.join([ "[Validation]: ", "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}", "max mem: {memory:.0f}", ]).format( eta=eta_string, iter=iteration, meters=str(meters_val), lr=optimizer.param_groups[0]["lr"], memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, )) if iteration == max_iter: checkpointer.save("model_final", **arguments) total_training_time = time.time() - start_training_time total_time_str = str(datetime.timedelta(seconds=total_training_time)) logger.info("Total training time: {} ({:.4f} s / it)".format( total_time_str, total_training_time / (max_iter)))
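# The distillation terms used above (new_box_loss, new_center_loss,
# pixel_wise_loss) are not defined in this file. A minimal sketch of what such
# teacher/student losses could look like (assumptions: the scalar regression
# and centerness losses are bounded against the frozen teacher, and box_cls is
# a list of per-FPN-level (N, C, H, W) classification maps):
import torch
import torch.nn.functional as F


def new_box_loss(t_loss, s_loss):
    # penalize the student only when it is worse than the teacher
    return F.relu(s_loss - t_loss)


def new_center_loss(t_loss, s_loss):
    return F.relu(s_loss - t_loss)


def pixel_wise_loss(s_cls_maps, t_cls_maps, mode="KL"):
    loss = 0.0
    for s, t in zip(s_cls_maps, t_cls_maps):
        s = s.flatten(2)  # (N, C, H*W)
        t = t.flatten(2)
        if mode == "KL":
            loss = loss + F.kl_div(F.log_softmax(s, dim=1),
                                   F.softmax(t, dim=1),
                                   reduction="batchmean")
        else:  # cross-entropy between soft teacher/student distributions
            loss = loss - (F.softmax(t, dim=1) *
                           F.log_softmax(s, dim=1)).sum(dim=1).mean()
    return loss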
def make_data_loader(root_path, cfg, is_train=True, is_distributed=False, start_iter=0, class_ids=None, ignore_labels=False): num_gpus = get_world_size() if is_train: images_per_batch = cfg.SOLVER.IMS_PER_BATCH assert (images_per_batch % num_gpus == 0 ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number " "of GPUs ({}) used.".format(images_per_batch, num_gpus) images_per_gpu = images_per_batch // num_gpus shuffle = True num_iters = cfg.SOLVER.MAX_ITER else: images_per_batch = cfg.TEST.IMS_PER_BATCH assert (images_per_batch % num_gpus == 0 ), "TEST.IMS_PER_BATCH ({}) must be divisible by the number " "of GPUs ({}) used.".format(images_per_batch, num_gpus) images_per_gpu = images_per_batch // num_gpus shuffle = False if not is_distributed else True num_iters = None start_iter = 0 if images_per_gpu > 1: logger = logging.getLogger( "maskrcnn_benchmark.dataset_gtboxframe.make_data_loader") logger.warning( "When using more than one image per GPU you may encounter " "an out-of-memory (OOM) error if your GPU does not have " "sufficient memory. If this happens, you can reduce " "SOLVER.IMS_PER_BATCH (for training) or " "TEST.IMS_PER_BATCH (for inference). For training, you must " "also adjust the learning rate and schedule length according " "to the linear scaling rule. See for example: " "https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14" ) # group images which have similar aspect ratio. In this case, we only # group in two cases: those with width / height > 1, and the other way around, # but the code supports more general grouping strategy aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else [] transforms = None if not is_train and cfg.TEST.BBOX_AUG.ENABLED else build_transforms( cfg, is_train) dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST if not is_train and not ignore_labels: assert class_ids is not None, "For validation datasets, class_ids has to be provided!" datasets = [ build_detection_dataset_by_name(root_path, name, transforms, class_ids=class_ids, cache_images=False, ignore_labels=ignore_labels) for name in dataset_list ] if is_train: assert len( datasets ) == 1, "Can train on only one dataset, otherwise have to merge classes" class_ids = datasets[0].get_class_ids() data_loaders = [] for dataset in datasets: sampler = make_data_sampler(dataset, shuffle, is_distributed) batch_sampler = make_batch_data_sampler(dataset, sampler, aspect_grouping, images_per_gpu, num_iters, start_iter) collator = BBoxAugCollator() if not is_train and cfg.TEST.BBOX_AUG.ENABLED else \ BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY) num_workers = cfg.DATALOADER.NUM_WORKERS data_loader = torch.utils.data.DataLoader( dataset, num_workers=num_workers, batch_sampler=batch_sampler, collate_fn=collator, ) data_loaders.append(data_loader) if is_train: # during training a single (possibly concatenated) data_loader is returned assert len(data_loaders) == 1 return data_loaders[0], class_ids return data_loaders
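# Typical call pattern for the loader above: build the training loader once to
# obtain class_ids, then reuse those ids when building the validation loaders.
# A sketch; root_path, cfg and distributed come from the surrounding script:
train_loader, class_ids = make_data_loader(root_path, cfg, is_train=True,
                                           is_distributed=distributed)
val_loaders = make_data_loader(root_path, cfg, is_train=False,
                               is_distributed=distributed,
                               class_ids=class_ids)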
def do_train(model, data_loader, optimizer, scheduler, checkpointer, device,
             checkpoint_period, arguments, logger,
             tensorboard_writer: TensorboardWriter = None):
    logger.info("Start training")
    meters = MetricLogger(delimiter=" ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        if any(len(target) < 1 for target in targets):
            # these must be f-strings, otherwise the ids and lengths are not interpolated
            logger.error(
                f"Iteration={iteration + 1} || Image Ids used for training {_} || "
                f"targets Length={[len(target) for target in targets]}")
            continue
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration
        scheduler.step()
        images = images.to(device)
        targets = [target.to(device) for target in targets]
        result, loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)
        optimizer.zero_grad()
        # Note: If mixed precision is not used, this ends up doing nothing
        # Otherwise apply loss scaling for mixed-precision recipe
        with amp.scale_loss(losses, optimizer) as scaled_losses:
            scaled_losses.backward()
        optimizer.step()
        # write images / ground truth / evaluation metrics to tensorboard
        if tensorboard_writer is not None:
            tensorboard_writer(iteration, losses_reduced, loss_dict_reduced,
                               images, targets)
        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)
        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
        if get_world_size() < 2 or dist.get_rank() == 0:
            if iteration % 20 == 0 or iteration == max_iter:
                logger.info(
                    meters.delimiter.join([
                        "eta: {eta}",
                        "iter: {iter}",
                        "{meters}",
                        "lr: {lr:.6f}",
                    ]).format(
                        eta=eta_string,
                        iter=iteration,
                        meters=str(meters),
                        lr=optimizer.param_groups[0]["lr"],
                    ))
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)
    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
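# do_train above calls tensorboard_writer(iteration, losses_reduced,
# loss_dict_reduced, images, targets). A minimal sketch of a TensorboardWriter
# with that call signature (assumption: the real class also logs images and
# ground truth; here only the scalar losses are written):
from torch.utils.tensorboard import SummaryWriter


class TensorboardWriter:
    def __init__(self, log_dir, log_period=20):
        self.writer = SummaryWriter(log_dir=log_dir)
        self.log_period = log_period

    def __call__(self, iteration, total_loss, loss_dict, images=None, targets=None):
        if iteration % self.log_period != 0:
            return
        self.writer.add_scalar("loss/total", float(total_loss), iteration)
        for name, value in loss_dict.items():
            self.writer.add_scalar("loss/{}".format(name), float(value), iteration)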
def do_train( model, data_loader, optimizer, scheduler, checkpointer, device, checkpoint_period, arguments, vis, distributed, cfg, ): logger = logging.getLogger("maskrcnn_benchmark.trainer") logger.info("Start training") meters = MetricLogger(delimiter=" ") max_iter = len(data_loader) start_iter = arguments["iteration"] model.train() start_training_time = time.time() end = time.time() for iteration, (images, targets, _) in enumerate(data_loader, start_iter): #import pdb #pdb.set_trace() data_time = time.time() - end iteration = iteration + 1 arguments["iteration"] = iteration scheduler.step() images = images.to(device) targets = [target.to(device) for target in targets] loss_dict = model(images, targets) losses = sum(loss for loss in loss_dict.values()) # reduce losses over all GPUs for logging purposes loss_dict_reduced = reduce_loss_dict(loss_dict) losses_reduced = sum(loss for loss in loss_dict_reduced.values()) meters.update(loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad() losses.backward() optimizer.step() batch_time = time.time() - end end = time.time() meters.update(time=batch_time, data=data_time) eta_seconds = meters.time.global_avg * (max_iter - iteration) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) if iteration % 20 == 0 or iteration == max_iter or iteration == 1: logger.info( meters.delimiter.join( [ "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}", "max mem: {memory:.0f}", ] ).format( eta=eta_string, iter=iteration, meters=str(meters), lr=optimizer.param_groups[0]["lr"], memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, ) ) #loss_box_reg_mean = meters.meters['loss_box_reg'].global_avg #print(loss_box_reg_mean) if get_world_size()<2 or dist.get_rank() == 0: vis.add_value('loss', meters.meters['loss'].global_avg) vis.add_value('loss_box_reg', meters.meters['loss_box_reg'].global_avg) vis.add_value('loss_classifier', meters.meters['loss_classifier'].global_avg) vis.add_value('loss_objectness', meters.meters['loss_objectness'].global_avg) vis.add_value('loss_rpn_box_reg', meters.meters['loss_rpn_box_reg'].global_avg) # model.train() # results, results_coco=run_validation(cfg, model, distributed) # model.train() # # AP = results.results['bbox']['AP'] # AP50 = results.results['bbox']['AP50'] # AP75 = results.results['bbox']['AP75'] # APs = results.results['bbox']['APs'] # APm = results.results['bbox']['APm'] # APl = results.results['bbox']['APl'] # print("Inference--Iteration: {}, AP:{:.4f}".format(iteration, AP)) # if get_world_size()<2 or dist.get_rank() == 0: # vis.add_value('AP', AP) # vis.add_value('AP50', AP50) # vis.add_value('AP75', AP75) # vis.add_value('APs', APs) # vis.add_value('APm', APm) # vis.add_value('APl', APl) if iteration % checkpoint_period == 0: checkpointer.save("model_{:07d}".format(iteration), **arguments) if iteration == max_iter: checkpointer.save("model_final", **arguments) total_training_time = time.time() - start_training_time total_time_str = str(datetime.timedelta(seconds=total_training_time)) logger.info( "Total training time: {} ({:.4f} s / it)".format( total_time_str, total_training_time / (max_iter) ) )
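# The loop above logs smoothed losses through vis.add_value(name, value). A
# sketch of such a helper backed by visdom line plots (assumptions: one window
# per metric and an x-axis advanced by one per call; the real project may plot
# against the training iteration instead):
import numpy as np
import visdom


class LossVisualizer:
    def __init__(self, env="maskrcnn"):
        self.vis = visdom.Visdom(env=env)
        self.steps = {}

    def add_value(self, name, value):
        step = self.steps.get(name, 0)
        self.vis.line(Y=np.array([float(value)]),
                      X=np.array([step]),
                      win=name,
                      update="append" if step > 0 else None,
                      opts=dict(title=name))
        self.steps[name] = step + 1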
def do_train(model, cfg, data_loader, data_loader_val, optimizer, scheduler, checkpointer, device, checkpoint_period, test_period, arguments, output_dir='', visualize_loss='', vis_title='', iters_per_epoch=0): max_iter = len(data_loader) # arguments["iteration"] = max_iter start_iter = arguments["iteration"] if start_iter >= max_iter: checkpointer.save("model_{:07d}".format(start_iter), **arguments) checkpointer.save( "model_epoch_{:07d}".format( int(math.ceil(start_iter / iters_per_epoch))), **arguments) return logger = logging.getLogger("maskrcnn_benchmark.trainer") logger.info("Start training") meters = TensorboardXLogger(log_dir=os.path.join(output_dir, 'tensorboardX'), delimiter=" ") model.train() start_training_time = time.time() end = time.time() mkdir(output_dir) if visualize_loss == "visdom" and is_main_process(): from maskrcnn_benchmark.utils.visualization.visdom_visualizer import VisdomVisualizer vis_legend = None visualizer = VisdomVisualizer() else: visualizer = None scheduler.step(start_iter - 1) iou_types = ("bbox", ) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm", ) if cfg.MODEL.KEYPOINT_ON: iou_types = iou_types + ("keypoints", ) dataset_names = cfg.DATASETS.TEST if is_main_process(): tq = tqdm.tqdm(total=len(data_loader), initial=start_iter) for iteration, batch in enumerate(data_loader, start_iter): images = batch[0] if len(batch) > 2: info = batch[2] else: info = None reg_targets = None seg_targets = None if isinstance(batch[1], dict): roi_targets = batch[1]["roi_target"] if "reg_target" in batch[1].keys(): reg_targets = batch[1]["reg_target"] if "seg_target" in batch[1].keys(): seg_targets = batch[1]["seg_target"] else: roi_targets = batch[1] # for target in roi_targets: # print('labels: ', target.extra_fields['labels']) # print('second_labels: ', target.extra_fields['second_labels']) if any(len(target) < 1 for target in roi_targets): roi_targets = None # logger.error(f"Iteration={iteration+1} || Image Ids used for training {_} || targets Length={[len(target) for target in roi_targets]}") # continue # print(info) # print(roi_targets) data_time = time.time() - end iteration = iteration + 1 arguments["iteration"] = iteration if is_main_process(): tq.set_description('Iteration {}'.format(iteration)) tq.update(1) images = images.to(device) if roi_targets is not None: roi_targets = [target.to(device) for target in roi_targets] if reg_targets is not None: reg_targets = reg_targets.to(device) if seg_targets is not None: seg_targets = seg_targets.to(device) global_targets = dict(reg_targets=reg_targets, seg_targets=seg_targets) loss_dict = model(images, roi_targets, global_targets=global_targets) # reduce losses over all GPUs for logging purposes loss_dict_reduced = reduce_loss_dict(loss_dict) losses_reduced = sum(loss for loss in loss_dict_reduced.values()) meters.update(iteration, loss=losses_reduced, **loss_dict_reduced) losses = sum(loss for loss in loss_dict.values()) if visualizer: if vis_legend is None: vis_legend = [key for key in sorted(loss_dict_reduced.keys())] vis_legend.append('Total loss') iter_plot = visualizer.create_vis_plot('Iteration', 'loss', vis_title, vis_legend) for key in sorted(loss_dict_reduced.keys()): visualizer.update_vis_plot(iteration=iteration, loss=loss_dict_reduced[key], window=iter_plot, name=key, update_type='append') visualizer.update_vis_plot(iteration=iteration, loss=losses_reduced.data, window=iter_plot, name='Total loss', update_type='append') optimizer.zero_grad() # Note: If mixed precision is not used, this ends up doing 
nothing # Otherwise apply loss scaling for mixed-precision recipe if device == get_device(str_device='cuda'): with amp.scale_loss(losses, optimizer) as scaled_losses: scaled_losses.backward() else: # pooling doesn't support CPU # losses.backward() pass optimizer.step() scheduler.step() batch_time = time.time() - end end = time.time() meters.update(iteration, time=batch_time, data=data_time) eta_seconds = meters.time.global_avg * (max_iter - iteration) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) # tq.set_postfix( # log="max mem: {.0f}, lr: {.6f}, loss: {.6f}".format( # float(torch.cuda.max_memory_allocated() / 1024.0 / 1024.0), # float(optimizer.param_groups[0]["lr"]), # float(losses_reduced) # ) # ) if iteration % 20 == 0 or iteration == max_iter: logger.info( meters.delimiter.join([ "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}", "max mem: {memory:.0f}", ]).format( eta=eta_string, iter=iteration, meters=str(meters), lr=optimizer.param_groups[0]["lr"], memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, )) if iteration % checkpoint_period == 0: checkpointer.save("model_{:07d}".format(iteration), **arguments) if iters_per_epoch > 0: if iteration % iters_per_epoch == 0: checkpointer.save( "model_epoch_{:07d}".format( int(math.ceil(iteration / iters_per_epoch))), **arguments) if data_loader_val is not None and test_period > 0 and iteration % test_period == 0: meters_val = TensorboardXLogger(log_dir=None, delimiter=" ") synchronize() _ = inference( # The result can be used for additional logging, e. g. for TensorBoard model, # The method changes the segmentation mask format in a data loader, # so every time a new data loader is created: make_data_loader(cfg, is_train=False, is_distributed=(get_world_size() > 1), is_for_period=True), dataset_name="[Validation]", iou_types=iou_types, box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, bbox_aug=cfg.TEST.BBOX_AUG.ENABLED, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=None, ) synchronize() model.train() with torch.no_grad(): # Should be one image for each GPU: for iteration_val, (images_val, targets_val, _) in enumerate(tqdm(data_loader_val)): images_val = images_val.to(device) targets_val = [target.to(device) for target in targets_val] loss_dict = model(images_val, targets_val) losses = sum(loss for loss in loss_dict.values()) loss_dict_reduced = reduce_loss_dict(loss_dict) losses_reduced = sum( loss for loss in loss_dict_reduced.values()) meters_val.update(loss=losses_reduced, **loss_dict_reduced) synchronize() logger.info( meters_val.delimiter.join([ "[Validation]: ", "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}", "max mem: {memory:.0f}", ]).format( eta=eta_string, iter=iteration, meters=str(meters_val), lr=optimizer.param_groups[0]["lr"], memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, )) iteration = len(data_loader) if is_main_process(): tq.close() checkpointer.save("model_{:07d}".format(iteration), **arguments) checkpointer.save( "model_epoch_{:07d}".format(int(math.ceil( iteration / iters_per_epoch))), **arguments) total_training_time = time.time() - start_training_time total_time_str = str(datetime.timedelta(seconds=total_training_time)) logger.info("Total training time: {} ({:.4f} s / it)".format( total_time_str, total_training_time / max_iter))
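# The mixed-precision branch above compares the training device against
# get_device(str_device='cuda'). That helper is not shown here; a minimal
# sketch consistent with the call site (assumption: it wraps torch.device and
# falls back to CPU when CUDA is unavailable):
import torch


def get_device(str_device='cuda'):
    if str_device == 'cuda' and not torch.cuda.is_available():
        return torch.device('cpu')
    return torch.device(str_device)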
) ) summary_writter.add_scalar('losses/total_loss', losses_reduced, global_step=iteration) for loss_name, loss_item in loss_dict_reduced.items(): summary_writter.add_scalar('losses/{}'.format(loss_name), loss_item, global_step=iteration) summary_writter.add_scalar('lr', optimizer.param_groups[0]['lr'], global_step=iteration) if iteration % checkpoint_period == 0: checkpointer.save("model_{:07d}".format(iteration), **arguments) if data_loader_val is not None and test_period > 0 and iteration % test_period == 0: meters_val = MetricLogger(delimiter=" ") synchronize() _ = inference( # The result can be used for additional logging, e. g. for TensorBoard model, # The method changes the segmentation mask format in a data loader, # so every time a new data loader is created: make_data_loader(cfg, is_train=False, is_distributed=(get_world_size() > 1), is_for_period=True), dataset_name="[Validation]", iou_types=iou_types, box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=None, ) synchronize() model.train() with torch.no_grad(): # Should be one image for each GPU: for iteration_val, (images_val, targets_val, _) in enumerate(tqdm(data_loader_val)): images_val = images_val.to(device) targets_val = [target.to(device) for target in targets_val]
def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0):
    num_gpus = get_world_size()
    if is_train:
        images_per_batch = cfg.SOLVER.IMS_PER_BATCH  # 2
        assert images_per_batch % num_gpus == 0, (
            "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number "
            "of GPUs ({}) used.".format(images_per_batch, num_gpus))
        images_per_gpu = images_per_batch // num_gpus
        shuffle = True
        num_iters = cfg.SOLVER.MAX_ITER  # 720000
    else:
        images_per_batch = cfg.TEST.IMS_PER_BATCH  # 1
        assert images_per_batch % num_gpus == 0, (
            "TEST.IMS_PER_BATCH ({}) must be divisible by the number "
            "of GPUs ({}) used.".format(images_per_batch, num_gpus))
        images_per_gpu = images_per_batch // num_gpus
        shuffle = False if not is_distributed else True
        num_iters = None
        start_iter = 0
    if images_per_gpu > 1:
        logger = logging.getLogger(__name__)
        logger.warning(
            "When using more than one image per GPU you may encounter "
            "an out-of-memory (OOM) error if your GPU does not have "
            "sufficient memory. If this happens, you can reduce "
            "SOLVER.IMS_PER_BATCH (for training) or "
            "TEST.IMS_PER_BATCH (for inference). For training, you must "
            "also adjust the learning rate and schedule length according "
            "to the linear scaling rule. See for example: "
            "https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14"
        )
    # group images which have similar aspect ratio. In this case, we only
    # group in two cases: those with width / height > 1, and the other way around,
    # but the code supports more general grouping strategy
    # defaults to True
    aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else []
    paths_catalog = import_file("maskrcnn_benchmark.config.paths_catalog",
                                cfg.PATHS_CATALOG, True)
    # maskrcnn_benchmark.config.paths_catalog.py
    DatasetCatalog = paths_catalog.DatasetCatalog  # bind the DatasetCatalog object to this name
    # ("coco_2014_train", "coco_2014_valminusminival") for train
    # ("coco_2014_minival",) for test
    dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST
    # the target passed through the transforms is the BoxList of the image
    transforms = build_transforms(cfg, is_train)
    # build the COCODataset objects
    datasets = build_dataset(dataset_list, transforms, DatasetCatalog, is_train)
    data_loaders = []
    for dataset in datasets:
        # create the RandomSampler (or distributed sampler) here
        sampler = make_data_sampler(dataset, shuffle, is_distributed)
        # create the BatchSampler
        batch_sampler = make_batch_data_sampler(dataset, sampler,
                                                aspect_grouping, images_per_gpu,
                                                num_iters, start_iter)
        # collator that pads the images in a batch to a common size
        collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY)  # 32
        # Number of data loading threads, 4
        num_workers = cfg.DATALOADER.NUM_WORKERS
        data_loader = torch.utils.data.DataLoader(
            dataset,
            num_workers=num_workers,
            batch_sampler=batch_sampler,
            collate_fn=collator,
        )
        data_loaders.append(data_loader)
    if is_train:
        # during training, a single (possibly concatenated) data_loader is returned
        assert len(data_loaders) == 1
        return data_loaders[0]
    return data_loaders
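# The comments above note that make_data_sampler returns a RandomSampler (or a
# distributed sampler) which make_batch_data_sampler then wraps. A sketch of
# the sampler selection, written from the call site rather than the project's
# own source (torch's DistributedSampler stands in for the custom one):
import torch


def make_data_sampler(dataset, shuffle, distributed):
    if distributed:
        # shards the dataset across ranks; shuffling is handled per epoch
        return torch.utils.data.distributed.DistributedSampler(dataset, shuffle=shuffle)
    if shuffle:
        return torch.utils.data.sampler.RandomSampler(dataset)
    return torch.utils.data.sampler.SequentialSampler(dataset)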
def mlperf_test_early_exit(iteration, iters_per_epoch, tester, model, distributed, min_bbox_map, min_segm_map): # Note: let iters / epoch == 10k, at iter 9999 we've finished epoch 0 and need to test if iteration > 0 and (iteration + 1) % iters_per_epoch == 0: synchronize() epoch = iteration // iters_per_epoch + 1 mlperf_print(key=constants.EPOCH_STOP, metadata={"epoch_num": epoch}) mlperf_print(key=constants.BLOCK_STOP, metadata={"first_epoch_num": epoch}) mlperf_print(key=constants.EVAL_START, metadata={"epoch_num": epoch}) # set the async evaluator's tag correctly set_epoch_tag(epoch) # Note: No longer returns anything, underlying future is in another castle tester(model=model, distributed=distributed) # necessary for correctness model.train() else: # Otherwise, check for finished async results results = check_completed_tags() # on master process, check each result for terminating condition # sentinel for run finishing finished = 0 if is_main_process(): for result_epoch, (bbox_map, segm_map) in results.items(): # mlperf_print(key=constants.EVAL_TARGET, value={"BBOX": min_bbox_map, # "SEGM": min_segm_map}) logger = logging.getLogger('maskrcnn_benchmark.trainer') logger.info('bbox mAP: {}, segm mAP: {}'.format( bbox_map, segm_map)) mlperf_print( key=constants.EVAL_ACCURACY, value={"accuracy": { "BBOX": bbox_map, "SEGM": segm_map }}, metadata={"epoch_num": result_epoch}) mlperf_print(key=constants.EVAL_STOP, metadata={"epoch_num": result_epoch}) # terminating condition if bbox_map >= min_bbox_map and segm_map >= min_segm_map: logger.info("Target mAP reached, exiting...") finished = 1 #return True # We now know on rank 0 whether or not we should terminate # Bcast this flag on multi-GPU if get_world_size() > 1: with torch.no_grad(): finish_tensor = torch.tensor([finished], dtype=torch.int32, device=torch.device('cuda')) torch.distributed.broadcast(finish_tensor, 0) # If notified, end. if finish_tensor.item() == 1: return True else: # Single GPU, don't need to create tensor to bcast, just use value directly if finished == 1: return True # Otherwise, default case, continue return False
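# A sketch of how the early-exit check above is typically wired into the
# training loop: it returns True once the target bbox/segm mAP has been reached
# on rank 0 and broadcast to every rank. The loop itself and train_one_step are
# hypothetical; tester, iters_per_epoch and the mAP targets come from the
# surrounding MLPerf harness:
for iteration, batch in enumerate(data_loader, start_iter):
    train_one_step(batch)  # hypothetical per-iteration training step
    if mlperf_test_early_exit(iteration, iters_per_epoch, tester, model,
                              distributed, min_bbox_map, min_segm_map):
        break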
def make_mt_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0, mode='source', img_ratio=1.): num_gpus = get_world_size() dataset_list_dict = { 'source': 'papnuclei_source', 'no_label': 'papnuclei_no_label', } if is_train: images_per_batch = int(cfg.SOLVER.IMS_PER_BATCH * img_ratio) assert (images_per_batch % num_gpus == 0 ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number " "of GPUs ({}) used.".format(images_per_batch, num_gpus) images_per_gpu = images_per_batch // num_gpus shuffle = True num_iters = cfg.SOLVER.MAX_ITER else: images_per_batch = cfg.TEST.IMS_PER_BATCH assert (images_per_batch % num_gpus == 0 ), "TEST.IMS_PER_BATCH ({}) must be divisible by the number " "of GPUs ({}) used.".format(images_per_batch, num_gpus) images_per_gpu = images_per_batch // num_gpus shuffle = False if not is_distributed else True num_iters = None start_iter = 0 if images_per_gpu > 1: logger = logging.getLogger(__name__) logger.warning( "When using more than one image per GPU you may encounter " "an out-of-memory (OOM) error if your GPU does not have " "sufficient memory. If this happens, you can reduce " "SOLVER.IMS_PER_BATCH (for training) or " "TEST.IMS_PER_BATCH (for inference). For training, you must " "also adjust the learning rate and schedule length according " "to the linear scaling rule. See for example: " "https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14" ) # group images which have similar aspect ratio. In this case, we only # group in two cases: those with width / height > 1, and the other way around, # but the code supports more general grouping strategy aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else [] paths_catalog = import_file("maskrcnn_benchmark.config.paths_catalog", cfg.PATHS_CATALOG, True) DatasetCatalog = paths_catalog.DatasetCatalog if is_train: dataset = dataset_list_dict[mode] elif cfg.DATASETS.MODE_IN_TEST == 'val': dataset = cfg.DATASETS.VAL else: dataset = cfg.DATASETS.TEST transforms = build_transforms(cfg, is_train, domain=mode) syn_mt = True if cfg.SYN.MT_LOSS > 0 else False dataset = build_dataset([dataset], transforms, DatasetCatalog, is_train, aug_k=cfg.MT.AUG_K + cfg.MT.AUG_S, syn_mt=syn_mt, gen_true=cfg.DATASETS.GEN_TRUE) collators = build_collator(cfg, mode) dataloader = build_mt_data_loader(dataset[0], shuffle, is_distributed, aspect_grouping, images_per_gpu, num_iters, start_iter, cfg, is_train, collators) return dataloader
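# A sketch of how the mean-teacher loaders above might be requested: one loader
# for the labeled source split and one for the unlabeled split, with img_ratio
# dividing the per-step batch between them. The 0.5/0.5 split and the
# distributed flag are assumptions taken from the surrounding script:
source_loader = make_mt_data_loader(cfg, is_train=True, is_distributed=distributed,
                                    mode='source', img_ratio=0.5)
unlabeled_loader = make_mt_data_loader(cfg, is_train=True, is_distributed=distributed,
                                       mode='no_label', img_ratio=0.5)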
def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0, shuffle=None): # add by hui num_gpus = get_world_size() if is_train: images_per_batch = cfg.SOLVER.IMS_PER_BATCH assert (images_per_batch % num_gpus == 0 ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number " "of GPUs ({}) used.".format(images_per_batch, num_gpus) images_per_gpu = images_per_batch // num_gpus if shuffle is None: shuffle = True num_iters = cfg.SOLVER.MAX_ITER # ############################## add by hui ######################################## balance_normal = cfg.DATALOADER.USE_TRAIN_BALANCE_NORMAL normal_ratio = cfg.DATALOADER.TRAIN_NORMAL_RATIO remove_images_without_annotations = not balance_normal filter_ignore = cfg.DATASETS.COCO_DATASET.TRAIN_FILTER_IGNORE ################################################################################ else: images_per_batch = cfg.TEST.IMS_PER_BATCH assert (images_per_batch % num_gpus == 0 ), "TEST.IMS_PER_BATCH ({}) must be divisible by the number " "of GPUs ({}) used.".format(images_per_batch, num_gpus) images_per_gpu = images_per_batch // num_gpus if shuffle is None: shuffle = False if not is_distributed else True num_iters = None start_iter = 0 # ############################## add by hui ######################################## balance_normal = cfg.DATALOADER.USE_TEST_BALANCE_NORMAL normal_ratio = cfg.DATALOADER.TEST_NORMAL_RATIO if balance_normal: shuffle = True remove_images_without_annotations = False filter_ignore = cfg.DATASETS.COCO_DATASET.TEST_FILTER_IGNORE ################################################################################ if cfg.DATALOADER.DEBUG.CLOSE_SHUFFLE: # add by hui shuffle = False if images_per_gpu > 1: logger = logging.getLogger(__name__) logger.warning( "When using more than one image per GPU you may encounter " "an out-of-memory (OOM) error if your GPU does not have " "sufficient memory. If this happens, you can reduce " "SOLVER.IMS_PER_BATCH (for training) or " "TEST.IMS_PER_BATCH (for inference). For training, you must " "also adjust the learning rate and schedule length according " "to the linear scaling rule. See for example: " "https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14" ) # group images which have similar aspect ratio. 
In this case, we only # group in two cases: those with width / height > 1, and the other way around, # but the code supports more general grouping strategy aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else [] paths_catalog = import_file("maskrcnn_benchmark.config.paths_catalog", cfg.PATHS_CATALOG, True) DatasetCatalog = paths_catalog.DatasetCatalog dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST transforms = build_transforms(cfg, is_train) datasets = build_dataset(dataset_list, transforms, DatasetCatalog, is_train, remove_images_without_annotations, filter_ignore) # add by hui data_loaders = [] for dataset in datasets: sampler = make_data_sampler(dataset, shuffle, is_distributed, balance_normal, normal_ratio) # changed by hui batch_sampler = make_batch_data_sampler(dataset, sampler, aspect_grouping, images_per_gpu, num_iters, start_iter) collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY) num_workers = cfg.DATALOADER.NUM_WORKERS data_loader = torch.utils.data.DataLoader( dataset, num_workers=num_workers, batch_sampler=batch_sampler, collate_fn=collator, timeout=0, # add by hui for big batch ) data_loaders.append(data_loader) if is_train: # during training, a single (possibly concatenated) data_loader is returned assert len(data_loaders) == 1 return data_loaders[0] return data_loaders
def do_train( model, model_ema, data_loader, optimizer, scheduler, checkpointer, device, local_rank, checkpoint_period, cfg_arg, arguments, ): logger = logging.getLogger("maskrcnn_benchmark.trainer") logger.info("Start training") meters = MetricLogger(delimiter=" ") meters_ema = MetricLogger(delimiter=" ") max_iter = len(data_loader) start_iter = arguments["iteration"] ema_decay = arguments["ema_decay"] loss_semi = arguments['loss_semi'] temporal_save_path = cfg_arg["temporal_save_path"] model.train() model_ema.train() box_coder = BoxCoder(weights=(10., 10., 5., 5.)) temporal_ens = {} start_training_time = time.time() end = time.time() labeled_database = arguments["HYPER_PARAMETERS"]['LABELED_DATABASE'] temporal_supervised_losses = [] for iteration, (images, targets_with_trans_info, idx) in enumerate(data_loader, start_iter): targets = [_iter[0] for _iter in targets_with_trans_info] trans_info = [_iter[1] for _iter in targets_with_trans_info] try: db_idx, img_idx, idx_name, bboxes_batch = map_to_img( data_loader, idx) temporal_ens_bboxes = [ ensemble_bboxes(_boxes, _im_sz, arguments["ANCHOR_STRIDES"], arguments["HYPER_PARAMETERS"]['ENS_THRE'], device) for _boxes, _im_sz in zip(bboxes_batch, images.image_sizes) ] img_size = [(_sz[1], _sz[0]) for _sz in images.image_sizes] pred_trans_info = copy.deepcopy(trans_info) temporal_ens_pred = [] for i, _sz in enumerate(img_size): pred_trans_info[i][1] = _sz temporal_ens_per = [ trans_reverse(_temporal_ens, pred_trans_info[i]).to(device) for _temporal_ens in temporal_ens_bboxes[i] ] temporal_ens_pred.append(temporal_ens_per) db_w = [] for i, _db in enumerate(db_idx): if _db not in labeled_database: _bbox = BoxList( torch.zeros([1, 4]), (images.image_sizes[i][1], images.image_sizes[i][0]), mode="xyxy") _bbox.add_field('labels', torch.ones([1])) targets[i] = _bbox db_w.append(0.) else: db_w.append(1.) 
if any(len(target) < 1 for target in targets): logger.error( f"Iteration={iteration + 1} || Image Ids used for training {_} || targets Length={[len(target) for target in targets]}" ) continue data_time = time.time() - end iteration = iteration + 1 arguments["iteration"] = iteration images = images.to(device) targets = [target.to(device) for target in targets] update_ema_variables(model, model_ema, ema_decay, iteration) _loss_dict, result = model(images, targets) #---------------------loss masked by with torch.no_grad(): _loss_dict_ema, result_ema = model_ema(images, targets) is_labeled_db_weight = torch.tensor( db_w, dtype=torch.float32).to(device) loss_dict = {} loss_dict_ema = {} for _key in _loss_dict.keys(): loss_dict[_key] = torch.sum( torch.stack(_loss_dict[_key], dim=0) * is_labeled_db_weight) loss_dict_ema[_key] = torch.sum( torch.stack(_loss_dict_ema[_key], dim=0) * is_labeled_db_weight) # loss_dict = _loss_dict # loss_dict_ema = _loss_dict_ema #result_origin = [trans_reverse(_res,_info) for _res,_info in zip(result_ema,trans_info)] #result_origin = predict_collect_postprocess(arguments['postprocess'],result_ema,trans_info) result_origin = predict_retina_postprocess( arguments['postprocess'], box_coder, result_ema, trans_info, images.image_sizes) # any_zeros = [_iter.bbox.shape[0] == 0 for _iter in temporal_ens_pred] # if any(any_zeros): # loss_dict['semi_box_reg'] = torch.tensor(0,dtype=torch.float32,device=device) # loss_dict['semi_cls'] = torch.tensor(0,dtype=torch.float32,device=device) # else: # semi_loss = loss_semi( # result, temporal_ens_pred) # for _key in semi_loss.keys(): # loss_dict[_key] = torch.sum(torch.stack(semi_loss[_key],dim=0) * (1 - db_weight)) * arguments["semi_weight"] #balance losses with torch.no_grad(): supversed_loss = (loss_dict['loss_retina_cls'] + loss_dict['loss_retina_reg']) / ( np.sum(db_w) + 0.1) temporal_supervised_losses.append(supversed_loss) temporal_supervised_losses = temporal_supervised_losses[-100:] sup_loss = torch.stack(temporal_supervised_losses).mean() meters.update(sup_loss=sup_loss) if get_world_size() > 1: torch.distributed.all_reduce( torch.stack(temporal_supervised_losses).mean(), op=torch.distributed.ReduceOp.SUM) balance_weight = min(1. / (sup_loss / 0.28)**12, 1.) 
semi_loss = semi_loss_fn( result, result_ema, temporal_ens_pred, images.image_sizes, box_coder, n_cls=arguments["HYPER_PARAMETERS"]['NCLS'], reg_cons_w=arguments["HYPER_PARAMETERS"]['REG_CONSIST_WEIGHT']) semi_loss_weight = semi_weight_by_epoch( iteration, start_iter=arguments["HYPER_PARAMETERS"]['EPOCH_BATCH_NUM'] * arguments["HYPER_PARAMETERS"]['START_ITER'], rampup_length=arguments["HYPER_PARAMETERS"]['EPOCH_BATCH_NUM'] * arguments["HYPER_PARAMETERS"]['RAMPUP_LENGTH'], consistence_weight=arguments["HYPER_PARAMETERS"] ['CONSISTENCE_WEIGHT'], consistence_trunc=arguments["HYPER_PARAMETERS"] ['MAX_CONSISTENT_LOSS']) #semi_weight_by_epoch(iteration) for _key in semi_loss.keys(): #loss_dict[_key] = torch.sum(semi_loss[_key] * (1 - is_labeled_db_weight))*semi_loss_weight*balance_weight # not used labeled loss_dict[_key] = torch.sum(semi_loss[_key]) * semi_loss_weight for i, (_id, _labeled) in enumerate(zip(idx_name, db_w)): # if _labeled == 1: # continue result_dict = { 'iteration': iteration, 'result': result_origin[i] } if _id in temporal_ens.keys(): temporal_ens[_id].append(result_dict) else: temporal_ens[_id] = [result_dict] #print('id={},{},scores={}----------{}'.format(idx_name[0],idx_name[1],result_origin[0].get_field('objectness')[:5],result_origin[1].get_field('objectness')[:5])) losses = sum(loss for loss in loss_dict.values()) # reduce losses over all GPUs for logging purposes loss_dict_reduced = reduce_loss_dict(loss_dict) losses_reduced = sum(loss for loss in loss_dict_reduced.values()) meters.update(loss=losses_reduced, **loss_dict_reduced) loss_dict_reduced_ema = reduce_loss_dict(loss_dict_ema) losses_reduced_ema = sum( loss for loss in loss_dict_reduced_ema.values()) meters_ema.update(loss=losses_reduced_ema, **loss_dict_reduced_ema) optimizer.zero_grad() # Note: If mixed precision is not used, this ends up doing nothing # Otherwise apply loss scaling for mixed-precision recipe with amp.scale_loss(losses, optimizer) as scaled_losses: scaled_losses.backward() if not iteration < arguments["HYPER_PARAMETERS"][ 'EPOCH_BATCH_NUM'] * arguments["HYPER_PARAMETERS"][ 'START_ITER']: optimizer.step() #scheduler.step() batch_time = time.time() - end end = time.time() meters.update(time=batch_time, data=data_time) eta_seconds = meters.time.global_avg * (max_iter - iteration) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) if iteration % 20 == 0 or iteration == max_iter: logger.info( meters.delimiter.join([ "eta: {eta}", "iter: {iter}", "{meters}", "{meters_ema}", "lr: {lr:.6f}", "semi_w:{semi_w:2.3f}", "supervised loss{sup_loss:2.3f}," "balance_weight{balance_weight:2.3f}," "max mem: {memory:.0f}", ]).format( eta=eta_string, iter=iteration, meters=str(meters), meters_ema=str(meters_ema), lr=optimizer.param_groups[0]["lr"], semi_w=semi_loss_weight, sup_loss=sup_loss, balance_weight=balance_weight, memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, )) if (iteration - 50) % 100 == 0: for _key in temporal_ens.keys(): for _iter in temporal_ens[_key]: str_folder = os.path.join( temporal_save_path, _key) #"{}/{}".format(temporal_save_path,_key) str_file = '{}/{}_loc{}_iter_x{:07d}.pt'.format( str_folder, _key, local_rank, _iter['iteration']) if not os.path.exists(str_folder): os.makedirs(str_folder) torch.save(_iter['result'], str_file) del _iter['result'] del temporal_ens temporal_ens = {} if iteration % checkpoint_period == 0: save_time = time.time() checkpointer.save("model_{:07d}".format(iteration), **arguments) if iteration == max_iter: checkpointer.save("model_final", 
**arguments) except Exception as e: logger.error('error while processing batch with dataset indices {}'.format(idx)) raise e total_training_time = time.time() - start_training_time total_time_str = str(datetime.timedelta(seconds=total_training_time)) logger.info("Total training time: {} ({:.4f} s / it)".format( total_time_str, total_training_time / (max_iter)))
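# Two helpers the trainer above relies on, sketched under assumptions. The EMA
# update follows the common mean-teacher recipe (teacher weights are an
# exponential moving average of the student); the ramp-up uses the standard
# sigmoid schedule exp(-5 * (1 - x)^2). The real update_ema_variables and
# semi_weight_by_epoch in this codebase may differ in detail.
import numpy as np

def update_ema_variables(model, ema_model, alpha, global_step):
    # use a smaller effective decay early on so the teacher tracks the student quickly
    alpha = min(1.0 - 1.0 / (global_step + 1), alpha)
    for ema_param, param in zip(ema_model.parameters(), model.parameters()):
        ema_param.data.mul_(alpha).add_(param.data, alpha=1.0 - alpha)

def semi_weight_by_epoch(iteration, start_iter, rampup_length,
                         consistence_weight, consistence_trunc):
    # no consistency loss before the semi-supervised phase begins
    if iteration < start_iter:
        return 0.0
    x = np.clip((iteration - start_iter) / float(rampup_length), 0.0, 1.0)
    weight = consistence_weight * float(np.exp(-5.0 * (1.0 - x) ** 2))
    # cap the weight (assumption: MAX_CONSISTENT_LOSS acts as an upper bound)
    return min(weight, consistence_trunc)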
def do_train( model, data_loader, optimizer, scheduler, checkpointer, device, checkpoint_period, arguments, log_step=20, data_partition=1, explicit_average_grad=False, no_update=False, ): logger = logging.getLogger("maskrcnn_benchmark.trainer") logger.info("Start training") meters = MetricLogger(delimiter=" ") max_iter = len(data_loader) start_iter = arguments["iteration"] start_training_time = time.time() end = time.time() log_start = time.time() from qd.qd_common import is_hvd_initialized use_hvd = is_hvd_initialized() visualize_input = False fix_input = False for iteration, (images, targets, _) in enumerate(data_loader, start_iter): if hasattr(images, 'image_sizes') and len(images.image_sizes) == 0: logging.error('this should never happen since different workers ' 'will have different numbers of iterations.') continue if fix_input: logging.info('fix input') from qd.qd_common import run_if_not_memory_cached def get_x(x): return x images = run_if_not_memory_cached(get_x, images, __key='images') targets = run_if_not_memory_cached(get_x, targets, __key='targets') if visualize_input: from qd.qd_pytorch import visualize_maskrcnn_input visualize_maskrcnn_input(images, targets, show_box=True) data_time = time.time() - end iteration = iteration + 1 arguments["iteration"] = iteration if not no_update: scheduler.step() if isinstance(images, list): images = [x.to(device) for x in images] else: images = images.to(device) if isinstance(targets, torch.Tensor): targets = targets.to(device) else: targets = [target.to(device) for target in targets] if not no_update: optimizer.zero_grad() all_image_target = partition_data(images, targets, data_partition) for curr_images, curr_target in all_image_target: forward_backward(model, curr_images, curr_target, optimizer, arguments, checkpointer, use_hvd, meters, device, loss_scalar=1. / data_partition, no_update=no_update) if explicit_average_grad: average_gradients(model) if not no_update: optimizer.step() batch_time = time.time() - end end = time.time() if iteration > start_iter + 5: # we will skip the first few iterations since the time cost # evaluation for those are not good meters.update(time=batch_time, data=data_time) if iteration % log_step == 0 or iteration == max_iter: speed = get_world_size() * log_step * len(targets) / (time.time() - log_start) if hasattr(meters, 'time'): eta_seconds = meters.time.global_avg * (max_iter - iteration) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) else: eta_string = 'Unknown' logger.info( meters.delimiter.join([ "eta: {eta}", "iter: {iter}", 'speed: {speed:.1f} images/sec', "{meters}", "lr: {lr:.6f}", "max mem: {memory:.0f}", ]).format( eta=eta_string, iter=iteration, speed=speed, meters=str(meters), lr=optimizer.param_groups[0]["lr"], memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, )) log_start = time.time() if iteration % checkpoint_period == 0: # with blobfuse, saving could fail with unknown reason. Instead of # saving and crashing, we do a best-effort manner. 
try_save_intermediate_snapshot(checkpointer, iteration, arguments) checkpointer.save("model_final", **arguments) if get_rank() > 0: old_value = checkpointer.save_to_disk checkpointer.save_to_disk = True checkpointer.save("model_final_{}".format(get_rank()), **arguments) checkpointer.save_to_disk = old_value total_training_time = time.time() - start_training_time total_time_str = str(datetime.timedelta(seconds=total_training_time)) logger.info("Total training time: {} ({:.4f} s / it)".format( total_time_str, total_training_time / (1 if max_iter == 0 else max_iter)))
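# A minimal sketch of explicit gradient averaging across workers, used when
# explicit_average_grad is set above. Assumes torch.distributed is initialized;
# the repository's average_gradients helper may also cover the horovod path.
import torch.distributed as dist

def average_gradients(model):
    world_size = dist.get_world_size()
    if world_size < 2:
        return
    for param in model.parameters():
        if param.grad is not None:
            dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
            param.grad.data /= world_size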
def inference( model, data_loader, dataset_name, iou_types=("bbox", ), box_only=False, bbox_aug=False, device="cuda", expected_results=(), expected_results_sigma_tol=4, output_folder=None, ): # convert to a torch.device for efficiency device = torch.device(device) num_devices = get_world_size() logger = logging.getLogger("maskrcnn_benchmark.inference") dataset = data_loader.dataset logger.info("Start evaluation on {} dataset({} images).".format( dataset_name, len(dataset))) total_timer = Timer() inference_timer = Timer() total_timer.tic() predictions = compute_on_dataset(model, data_loader, device, bbox_aug, inference_timer) # wait for all processes to complete before measuring the time synchronize() print('>>>>>>==============results_dict_cpu.keys()=', len(predictions.keys()), predictions.keys()) total_time = total_timer.toc() total_time_str = get_time_str(total_time) logger.info( "Total run time: {} ({} s / img per device, on {} devices)".format( total_time_str, total_time * num_devices / len(dataset), num_devices)) total_infer_time = get_time_str(inference_timer.total_time) logger.info( "Model inference time: {} ({} s / img per device, on {} devices)". format( total_infer_time, inference_timer.total_time * num_devices / len(dataset), num_devices, )) predictions = _accumulate_predictions_from_multiple_gpus(predictions) print('>>>>>><<<<<<<<<<<==============results_dict_cpu.keys()=', len(predictions)) print(predictions[0]) if not is_main_process(): return if output_folder: torch.save(predictions, os.path.join(output_folder, "predictions.pth")) extra_args = dict( box_only=box_only, iou_types=iou_types, expected_results=expected_results, expected_results_sigma_tol=expected_results_sigma_tol, ) return evaluate(dataset=dataset, predictions=predictions, output_folder=output_folder, **extra_args)
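# A sketch of the multi-GPU gather step used above, in the spirit of the
# upstream maskrcnn_benchmark helper: every process holds a dict
# {image_id: BoxList}; the dicts are gathered and merged on the main process
# and returned as a list ordered by image id.
from maskrcnn_benchmark.utils.comm import all_gather, is_main_process

def _accumulate_predictions_from_multiple_gpus(predictions_per_gpu):
    all_predictions = all_gather(predictions_per_gpu)
    if not is_main_process():
        return
    predictions = {}
    for p in all_predictions:
        predictions.update(p)
    image_ids = sorted(predictions.keys())
    return [predictions[i] for i in image_ids]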
def make_data_loader(cfg, annotations, classes, is_train=True, is_distributed=False, start_iter=0): num_gpus = get_world_size() if is_train: images_per_batch = cfg.SOLVER.IMS_PER_BATCH assert ( images_per_batch % num_gpus == 0 ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number " "of GPUs ({}) used.".format(images_per_batch, num_gpus) images_per_gpu = images_per_batch // num_gpus shuffle = True num_iters = cfg.SOLVER.MAX_ITER else: images_per_batch = cfg.TEST.IMS_PER_BATCH assert ( images_per_batch % num_gpus == 0 ), "TEST.IMS_PER_BATCH ({}) must be divisible by the number " "of GPUs ({}) used.".format(images_per_batch, num_gpus) images_per_gpu = images_per_batch // num_gpus shuffle = False if not is_distributed else True num_iters = None start_iter = 0 if images_per_gpu > 1: logger = logging.getLogger(__name__) logger.warning( "When using more than one image per GPU you may encounter " "an out-of-memory (OOM) error if your GPU does not have " "sufficient memory. If this happens, you can reduce " "SOLVER.IMS_PER_BATCH (for training) or " "TEST.IMS_PER_BATCH (for inference). For training, you must " "also adjust the learning rate and schedule length according " "to the linear scaling rule. See for example: " "https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14" ) # group images which have similar aspect ratio. In this case, we only # group in two cases: those with width / height > 1, and the other way around, # but the code supports more general grouping strategy aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else [] transforms = build_transforms(cfg, is_train) datasets = [CustomDataset(annotations, transforms=transforms, classes=classes)] data_loaders = [] for dataset in datasets: sampler = make_data_sampler(dataset, shuffle, is_distributed) batch_sampler = make_batch_data_sampler( dataset, sampler, aspect_grouping, images_per_gpu, num_iters, start_iter ) collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY) num_workers = cfg.DATALOADER.NUM_WORKERS data_loader = torch.utils.data.DataLoader( dataset, num_workers=num_workers, batch_sampler=batch_sampler, collate_fn=collator, ) data_loaders.append(data_loader) if is_train: # during training, a single (possibly concatenated) data_loader is returned assert len(data_loaders) == 1 return data_loaders[0] return data_loaders
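# A sketch of the dataset contract that the loader above expects from
# CustomDataset: __getitem__ returns (image, BoxList target, index) and
# get_img_info returns the image size. The annotation layout (a list of dicts
# with 'file_name', 'boxes' in xyxy pixel coordinates, integer 'labels',
# 'height', 'width') is an assumption for illustration.
import torch
from PIL import Image
from maskrcnn_benchmark.structures.bounding_box import BoxList

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, annotations, transforms=None, classes=None):
        self.annotations = annotations
        self.transforms = transforms
        self.classes = classes

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        ann = self.annotations[idx]
        img = Image.open(ann["file_name"]).convert("RGB")
        target = BoxList(torch.as_tensor(ann["boxes"], dtype=torch.float32),
                         img.size, mode="xyxy")
        target.add_field("labels", torch.as_tensor(ann["labels"], dtype=torch.int64))
        if self.transforms is not None:
            img, target = self.transforms(img, target)
        return img, target, idx

    def get_img_info(self, idx):
        ann = self.annotations[idx]
        return {"height": ann["height"], "width": ann["width"]}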
def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0): num_gpus = get_world_size() if is_train: images_per_batch = cfg.SOLVER.IMS_PER_BATCH assert ( images_per_batch % num_gpus == 0 ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number " "of GPUs ({}) used.".format(images_per_batch, num_gpus) images_per_gpu = images_per_batch // num_gpus shuffle = True num_iters = cfg.SOLVER.MAX_ITER else: images_per_batch = cfg.TEST.IMS_PER_BATCH assert ( images_per_batch % num_gpus == 0 ), "TEST.IMS_PER_BATCH ({}) must be divisible by the number " "of GPUs ({}) used.".format(images_per_batch, num_gpus) images_per_gpu = images_per_batch // num_gpus shuffle = False if not is_distributed else True num_iters = None start_iter = 0 if images_per_gpu > 1: logger = logging.getLogger(__name__) logger.warning( "When using more than one image per GPU you may encounter " "an out-of-memory (OOM) error if your GPU does not have " "sufficient memory. If this happens, you can reduce " "SOLVER.IMS_PER_BATCH (for training) or " "TEST.IMS_PER_BATCH (for inference). For training, you must " "also adjust the learning rate and schedule length according " "to the linear scaling rule. See for example: " "https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14" ) # group images which have similar aspect ratio. In this case, we only # group in two cases: those with width / height > 1, and the other way around, # but the code supports more general grouping strategy aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else [] paths_catalog = import_file( "maskrcnn_benchmark.config.paths_catalog", cfg.PATHS_CATALOG, True ) DatasetCatalog = paths_catalog.DatasetCatalog dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST transforms = build_transforms(cfg, is_train) datasets = build_dataset(dataset_list, transforms, DatasetCatalog, is_train) data_loaders = [] for dataset in datasets: sampler = make_data_sampler(dataset, shuffle, is_distributed) batch_sampler = make_batch_data_sampler( dataset, sampler, aspect_grouping, images_per_gpu, num_iters, start_iter ) collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY) num_workers = cfg.DATALOADER.NUM_WORKERS data_loader = torch.utils.data.DataLoader( dataset, num_workers=num_workers, batch_sampler=batch_sampler, collate_fn=collator, ) data_loaders.append(data_loader) if is_train: # during training, a single (possibly concatenated) data_loader is returned assert len(data_loaders) == 1 return data_loaders[0] return data_loaders
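# The sampler construction used above, mirroring the upstream
# maskrcnn_benchmark helper: a DistributedSampler for multi-GPU runs, otherwise
# a plain random or sequential sampler.
import torch
from maskrcnn_benchmark.data import samplers

def make_data_sampler(dataset, shuffle, distributed):
    if distributed:
        return samplers.DistributedSampler(dataset, shuffle=shuffle)
    if shuffle:
        return torch.utils.data.sampler.RandomSampler(dataset)
    return torch.utils.data.sampler.SequentialSampler(dataset)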
def do_train( cfg, model, data_loader, data_loader_val, optimizer, scheduler, checkpointer, device, checkpoint_period, test_period, arguments, writer, ): logger = logging.getLogger("maskrcnn_benchmark.trainer") logger.info("Start training") meters = MetricLogger(delimiter=" ") max_iter = len(data_loader) start_iter = arguments["iteration"] model.train() start_training_time = time.time() end = time.time() iou_types = ("bbox", ) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm", ) if cfg.MODEL.KEYPOINT_ON: iou_types = iou_types + ("keypoints", ) dataset_names = cfg.DATASETS.TEST for iteration, (images, targets, _) in enumerate(data_loader, start_iter): if any(len(target) < 1 for target in targets): logger.error( f"Iteration={iteration + 1} || Image Ids used for training {_} || targets Length={[len(target) for target in targets]}" ) continue data_time = time.time() - end iteration = iteration + 1 arguments["iteration"] = iteration images = images.to(device) targets = [target.to(device) for target in targets] loss_dict = model(images, targets) losses = sum(loss for loss in loss_dict.values()) # reduce losses over all GPUs for logging purposes loss_dict_reduced = reduce_loss_dict(loss_dict) losses_reduced = sum(loss for loss in loss_dict_reduced.values()) meters.update(loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad() # # Add images every 100 iterations if iteration % 100 == 0: # # Display images # image = images.tensors[0].cpu().numpy() # means = np.zeros((image.shape[0], image.shape[1], image.shape[2])) # means[0] = 102.9801 # means[1] = 115.9465 # means[2] = 122.7717 # image = image + means # image = image[[2, 1, 0]].astype(np.uint8) # writer.add_image('input image', image, iteration) # for b in range(len(targets[0].bbox)): # box = targets[0].bbox[b] # x1 = np.around(box[0].cpu().numpy()) # y1 = np.around(box[1].cpu().numpy()) # x2 = np.around(box[2].cpu().numpy()) # y2 = np.around(box[3].cpu().numpy()) # rr, cc = rectangle_perimeter(y1, x1, y2-y1, x2-x1) # image[:, rr, cc] = 255 # writer.add_image('target boxes', image, iteration) # # Display masks # masks = targets[0].get_field('masks')[0] # masks = masks.get_mask_tensor() # combined_mask = masks[0, :, :] # for i in range(1,8): # combined_mask = combined_mask | masks[i, :, :] # writer.add_image('mask', combined_mask.unsqueeze(0)*255, iteration) # writer.add_image('single part 2', masks[1, :, :].unsqueeze(0)*255, iteration) # writer.add_image('single part 3', masks[2, :, :].unsqueeze(0)*255, iteration) # writer.add_image('single part 4', masks[3, :, :].unsqueeze(0)*255, iteration) # writer.add_image('single part 5', masks[4, :, :].unsqueeze(0)*255, iteration) # writer.add_image('single part 6', masks[5, :, :].unsqueeze(0)*255, iteration) # writer.add_image('single part 7', masks[6, :, :].unsqueeze(0)*255, iteration) # writer.add_image('single part 8', masks[7, :, :].unsqueeze(0)*255, iteration) # Display Losses writer.add_scalar('loss', meters.loss.median, iteration) writer.add_scalar('loss_classifier', loss_dict_reduced['loss_classifier'].item(), iteration) writer.add_scalar('loss_box_reg', loss_dict_reduced['loss_box_reg'].item(), iteration) writer.add_scalar('loss_objectness', loss_dict_reduced['loss_objectness'].item(), iteration) writer.add_scalar('loss_rpn_box_reg', loss_dict_reduced['loss_rpn_box_reg'].item(), iteration) writer.add_scalar('loss_mask', loss_dict_reduced['loss_mask'].item(), iteration) writer.add_scalar('loss_kpt', loss_dict_reduced['loss_kp'].item(), iteration) writer.add_scalar('lr', 
optimizer.param_groups[0]['lr'], iteration) # Note: If mixed precision is not used, this ends up doing nothing # Otherwise apply loss scaling for mixed-precision recipe with amp.scale_loss(losses, optimizer) as scaled_losses: scaled_losses.backward() optimizer.step() scheduler.step() batch_time = time.time() - end end = time.time() meters.update(time=batch_time, data=data_time) eta_seconds = meters.time.global_avg * (max_iter - iteration) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) if iteration % 20 == 0 or iteration == max_iter: logger.info( meters.delimiter.join([ "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}", "max mem: {memory:.0f}", ]).format( eta=eta_string, iter=iteration, meters=str(meters), lr=optimizer.param_groups[0]["lr"], memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, )) if iteration % checkpoint_period == 0: checkpointer.save("model_{:07d}".format(iteration), **arguments) if data_loader_val is not None and test_period > 0 and iteration % test_period == 0: meters_val = MetricLogger(delimiter=" ") synchronize() _ = inference( # The result can be used for additional logging, e. g. for TensorBoard model, # The method changes the segmentation mask format in a data loader, # so every time a new data loader is created: make_data_loader(cfg, is_train=False, is_distributed=(get_world_size() > 1), is_for_period=True), dataset_name="[Validation]", iou_types=iou_types, box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=None, ) synchronize() model.train() with torch.no_grad(): # Should be one image for each GPU: for iteration_val, (images_val, targets_val, _) in enumerate(tqdm(data_loader_val)): images_val = images_val.to(device) targets_val = [target.to(device) for target in targets_val] loss_dict = model(images_val, targets_val) losses = sum(loss for loss in loss_dict.values()) loss_dict_reduced = reduce_loss_dict(loss_dict) losses_reduced = sum( loss for loss in loss_dict_reduced.values()) meters_val.update(loss=losses_reduced, **loss_dict_reduced) synchronize() logger.info( meters_val.delimiter.join([ "[Validation]: ", "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}", "max mem: {memory:.0f}", ]).format( eta=eta_string, iter=iteration, meters=str(meters_val), lr=optimizer.param_groups[0]["lr"], memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, )) if iteration == max_iter: checkpointer.save("model_final", **arguments) total_training_time = time.time() - start_training_time total_time_str = str(datetime.timedelta(seconds=total_training_time)) logger.info("Total training time: {} ({:.4f} s / it)".format( total_time_str, total_training_time / (max_iter)))
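# The scalar logging above indexes fixed keys ('loss_mask', 'loss_kp', ...) and
# raises KeyError when the corresponding head is disabled. A minimal,
# head-agnostic alternative sketch (assumes writer is a TensorBoard
# SummaryWriter, as passed into do_train above):
def log_reduced_losses(writer, loss_dict_reduced, total_loss, lr, iteration):
    writer.add_scalar("loss", total_loss, iteration)
    for name, value in loss_dict_reduced.items():
        writer.add_scalar(name, value.item(), iteration)
    writer.add_scalar("lr", lr, iteration)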
def make_data_loader_AL(cfg, is_train=True, is_distributed=False, start_iter=0, indices=None, is_passive=True): num_gpus = get_world_size() if is_train: images_per_batch = cfg.SOLVER.IMS_PER_BATCH assert (images_per_batch % num_gpus == 0 ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number " "of GPUs ({}) used.".format(images_per_batch, num_gpus) images_per_gpu = images_per_batch // num_gpus shuffle = True num_iters = cfg.SOLVER.MAX_ITER else: images_per_batch = cfg.TEST.IMS_PER_BATCH assert (images_per_batch % num_gpus == 0 ), "TEST.IMS_PER_BATCH ({}) must be divisible by the number " "of GPUs ({}) used.".format(images_per_batch, num_gpus) images_per_gpu = images_per_batch // num_gpus shuffle = False if not is_distributed else True num_iters = None start_iter = 0 if images_per_gpu > 1: logger = logging.getLogger(__name__) # logger.warning( # "When using more than one image per GPU you may encounter " # "an out-of-memory (OOM) error if your GPU does not have " # "sufficient memory. If this happens, you can reduce " # "SOLVER.IMS_PER_BATCH (for training) or " # "TEST.IMS_PER_BATCH (for inference). For training, you must " # "also adjust the learning rate and schedule length according " # "to the linear scaling rule. See for example: " # "https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14" # ) # group images which have similar aspect ratio. In this case, we only # group in two cases: those with width / height > 1, and the other way around, # but the code supports more general grouping strategy aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else [] paths_catalog = import_file("maskrcnn_benchmark.config.paths_catalog", cfg.PATHS_CATALOG, True) # print(cfg.PATHS_CATALOG) DatasetCatalog = paths_catalog.DatasetCatalog logger = logging.getLogger(__name__) dataset_list = cfg.DATASETS.TRAIN if not is_train: a = np.arange(101) indices = np.delete(a, indices) # print() logger.info(f"At DATA LOADER indices are {indices}") transforms = build_transforms(cfg, is_train) datasets = build_dataset(dataset_list, transforms, DatasetCatalog, is_train, cfg.DATASETS.STRATEGY, indices) data_loaders = [] for dataset in datasets: sampler = make_data_sampler(dataset, shuffle, is_distributed) #TO DELETE THE LINE # print("AD DATA SAMPLES", dataset.get_indices()) batch_sampler = make_batch_data_sampler(dataset, sampler, aspect_grouping, images_per_gpu, num_iters, start_iter) collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY) num_workers = cfg.DATALOADER.NUM_WORKERS data_loader = torch.utils.data.DataLoader( dataset, num_workers=num_workers, batch_sampler=batch_sampler, collate_fn=collator, ) data_loaders.append(data_loader) if is_train: # during training, a single (possibly concatenated) data_loader is returned assert len(data_loaders) == 1 return data_loaders[0] return data_loaders
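# Usage sketch for the active-learning loader above: 'indices' is the current
# labeled pool; with is_train=False the function switches to its complement
# (the unlabeled pool, drawn from the hard-coded range of 101 candidates).
# The random seed pool below and the already-loaded cfg are assumptions.
import numpy as np

labeled_pool = np.random.choice(101, size=10, replace=False)
labeled_loader = make_data_loader_AL(cfg, is_train=True, is_distributed=False,
                                     indices=labeled_pool)
unlabeled_loader = make_data_loader_AL(cfg, is_train=False, is_distributed=False,
                                       indices=labeled_pool)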
def do_train( cfg, model, data_loader, data_loader_val, optimizer, scheduler, checkpointer, device, checkpoint_period, test_period, arguments, meters, meters_val, ): logger = logging.getLogger("maskrcnn_benchmark.trainer") logger.info("Start training") # meters = MetricLogger(delimiter=" ") max_iter = len(data_loader) start_iter = arguments["iteration"] model.train() start_training_time = time.time() end = time.time() if cfg.MODEL.QRY_BALANCE: qry_cls_json_file = cfg.MODEL.QRY_INDICE_CLS with open(qry_cls_json_file, 'r') as f: batch_cls_qry = json.load(f) iou_types = ("bbox", ) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm", ) if cfg.MODEL.KEYPOINT_ON: iou_types = iou_types + ("keypoints", ) dataset_names = cfg.DATASETS.TEST rank = dist.get_rank() for iteration, (images, targets, img_id) in enumerate(data_loader, start_iter): # print(img_id) if any(len(target) < 1 for target in targets): logger.error( f"Iteration={iteration + 1} || Image Ids used for training {img_id} || targets Length={[len(target) for target in targets]}" ) continue data_time = time.time() - end scheduler.step() images = images.to(device) targets = [target.to(device) for target in targets] # print('batch_id_qry', batch_id_qry, img_id, # targets[0].extra_fields, targets[1].extra_fields) if cfg.MODEL.QRY_BALANCE: batch_id_qry = batch_cls_qry[rank][iteration * 2:iteration * 2 + 2] # print(img_id) # batch_id_qry = [batch_cls_qry[rank][iteration]] loss_dict = model(images, targets, batch_id=batch_id_qry, use_distill=cfg.MODEL.USE_DISTILL, img_id=img_id) else: loss_dict = model(images, targets, use_distill=cfg.MODEL.USE_DISTILL, img_id=img_id[0]) losses = sum(loss for loss in loss_dict.values()) iteration = iteration + 1 arguments["iteration"] = iteration # reduce losses over all GPUs for logging purposes loss_dict_reduced = reduce_loss_dict(loss_dict) losses_reduced = sum(loss for loss in loss_dict_reduced.values()) meters.update(iteration, loss=losses_reduced, lr=optimizer.param_groups[0]["lr"], **loss_dict_reduced) optimizer.zero_grad() # Note: If mixed precision is not used, this ends up doing nothing # Otherwise apply loss scaling for mixed-precision recipe with amp.scale_loss(losses, optimizer) as scaled_losses: scaled_losses.backward() optimizer.step() batch_time = time.time() - end end = time.time() meters.update(iteration, time=batch_time, data=data_time) eta_seconds = meters.time.global_avg * (max_iter - iteration) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) if iteration % 20 == 0 or iteration == max_iter: logger.info( meters.delimiter.join([ "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}", "max mem: {memory:.0f}", ]).format( eta=eta_string, iter=iteration, meters=str(meters), lr=optimizer.param_groups[0]["lr"], memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, )) if iteration % checkpoint_period == 0: checkpointer.save("model_{:07d}".format(iteration), **arguments) if data_loader_val is not None and test_period > 0 and iteration % test_period == 0: # meters_val = MetricLogger(delimiter=" ") synchronize() torch.cuda.empty_cache() output_folder = os.path.join(cfg.OUTPUT_DIR, "Validation") mkdir(output_folder) res_infer = inference( # The result can be used for additional logging, e. g. 
for TensorBoard model, iteration, # The method changes the segmentation mask format in a data loader, # so every time a new data loader is created: make_data_loader(cfg, is_train=False, is_distributed=(get_world_size() > 1), is_for_period=True), dataset_name="[Validation]", iou_types=iou_types, box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, ) # import pdb; pdb.set_trace() if res_infer: meters_val.update(iteration, **res_infer) synchronize() model.train() # the following part can be deleted # with torch.no_grad(): # # Should be one image for each GPU: # for iteration_val, (images_val, targets_val, _) in enumerate(tqdm(data_loader_val)): # images_val = images_val.to(device) # targets_val = [target.to(device) for target in targets_val] # loss_dict = model(images_val, targets_val) # losses = sum(loss for loss in loss_dict.values()) # loss_dict_reduced = reduce_loss_dict(loss_dict) # losses_reduced = sum( # loss for loss in loss_dict_reduced.values()) # meters_val.update( # iteration, loss=losses_reduced, **loss_dict_reduced) # synchronize() #### logger.info( meters_val.delimiter.join([ "[Validation]: ", "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}", "max mem: {memory:.0f}", ]).format( eta=eta_string, iter=iteration, meters=str(meters_val), lr=optimizer.param_groups[0]["lr"], memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, )) if iteration == max_iter: checkpointer.save("model_final", **arguments) total_training_time = time.time() - start_training_time total_time_str = str(datetime.timedelta(seconds=total_training_time)) logger.info("Total training time: {} ({:.4f} s / it)".format( total_time_str, total_training_time / (max_iter)))
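# The QRY_BALANCE branch above reads the query-class file as
# batch_cls_qry[rank][iteration * 2 : iteration * 2 + 2], i.e. one flat list
# per process rank with two entries per iteration (two images per GPU). A tiny
# sketch of producing such a file; this layout is inferred from the indexing
# and is an assumption, not a documented format.
import json

num_ranks, num_iterations, images_per_gpu = 2, 3, 2
batch_cls_qry = [[(rank + it) % 5 for it in range(num_iterations * images_per_gpu)]
                 for rank in range(num_ranks)]
with open("qry_indice_cls.json", "w") as f:
    json.dump(batch_cls_qry, f)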
def do_face_train_triplet( cfg, model, data_loader, data_loader_val, optimizer, scheduler, checkpointer, device, checkpoint_period, test_period, arguments, divs_nums, ): logger = logging.getLogger("maskrcnn_benchmark.trainer") logger.info("Start training") meters = MetricLogger(delimiter=" ") max_iter = len(data_loader) start_iter = arguments["iteration"] model.train() start_training_time = time.time() end = time.time() dataset_names = cfg.DATASETS.TEST for iteration, (img_a, img_p, img_n, label_p, label_n) in enumerate(data_loader, start_iter): data_time = time.time() - end iteration = iteration + 1 arguments["iteration"] = iteration img_a_list, _ = divs_tensors(device=device, tensors=img_a, targets=None, divs_nums=divs_nums) img_p_list, label_p_list = divs_tensors(device=device, tensors=img_p, targets=label_p, divs_nums=divs_nums) img_n_list, label_n_list = divs_tensors(device=device, tensors=img_n, targets=label_n, divs_nums=divs_nums) ####======== 拆分batch 可能对bn层有影响 ==========#### optimizer.zero_grad() for img_a, img_p, img_n, label_p, label_n in zip( img_a_list, img_p_list, img_n_list, label_p_list, label_n_list): loss_dict = model(tensors=[img_a, img_p, img_n], targets=[label_p, label_n], batch=iteration, total_batch=None) losses = sum(loss for loss in loss_dict.values()) # reduce losses over all GPUs for logging purposes loss_dict_reduced = reduce_loss_dict(loss_dict) losses_reduced = sum(loss for loss in loss_dict_reduced.values()) meters.update(loss=losses_reduced, **loss_dict_reduced) losses /= divs_nums with amp.scale_loss(losses, optimizer) as scaled_losses: scaled_losses.backward() optimizer.step() scheduler.step() batch_time = time.time() - end end = time.time() meters.update(time=batch_time, data=data_time) eta_seconds = meters.time.global_avg * (max_iter - iteration) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) if iteration % 20 == 0 or iteration == max_iter: logger.info( meters.delimiter.join([ "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}", "max mem: {memory:.0f}", ]).format( eta=eta_string, iter=iteration, meters=str(meters), lr=optimizer.param_groups[0]["lr"], memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, )) if iteration % checkpoint_period == 0: checkpointer.save("model_{:07d}".format(iteration), **arguments) if iteration > 40000: checkpointer.save_backbone("BACKBONE_{:07d}".format(iteration)) #####========= data test ============####### if data_loader_val is not None and test_period > 0 and iteration % test_period == 0: meters_val = MetricLogger(delimiter=" ") synchronize() _ = inference( # The result can be used for additional logging, e. g. 
for TensorBoard model, # The method changes the segmentation mask format in a data loader, # so every time a new data loader is created: make_data_loader(cfg, is_train=False, is_distributed=(get_world_size() > 1), is_for_period=True), dataset_name="[Validation]", iou_types=iou_types, box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=None, ) synchronize() model.train() with torch.no_grad(): # Should be one image for each GPU: for iteration_val, (images_val, targets_val, _) in enumerate(tqdm(data_loader_val)): images_val = images_val.to(device) targets_val = [target.to(device) for target in targets_val] loss_dict = model(images_val, targets_val) losses = sum(loss for loss in loss_dict.values()) loss_dict_reduced = reduce_loss_dict(loss_dict) losses_reduced = sum( loss for loss in loss_dict_reduced.values()) meters_val.update(loss=losses_reduced, **loss_dict_reduced) synchronize() logger.info( meters_val.delimiter.join([ "[Validation]: ", "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}", "max mem: {memory:.0f}", ]).format( eta=eta_string, iter=iteration, meters=str(meters_val), lr=optimizer.param_groups[0]["lr"], memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, )) if iteration == max_iter: checkpointer.save("model_final", **arguments) checkpointer.save_backbone("model_final") total_training_time = time.time() - start_training_time total_time_str = str(datetime.timedelta(seconds=total_training_time)) logger.info("Total training time: {} ({:.4f} s / it)".format( total_time_str, total_training_time / (max_iter)))
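# A minimal sketch of the batch-splitting helper used above for gradient
# accumulation (note losses are divided by divs_nums before backward). The
# real divs_tensors may handle ImageList/BoxList inputs; this sketch assumes
# plain stacked tensors.
import torch

def divs_tensors(device, tensors, targets, divs_nums):
    tensor_chunks = [t.to(device) for t in torch.chunk(tensors, divs_nums, dim=0)]
    if targets is None:
        target_chunks = [None] * len(tensor_chunks)
    else:
        target_chunks = [t.to(device) for t in torch.chunk(targets, divs_nums, dim=0)]
    return tensor_chunks, target_chunks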
def do_train( cfg, model, data_loader, data_loader_val, optimizer, scheduler, checkpointer, device, checkpoint_period, test_period, arguments, meters, ): logger = logging.getLogger("maskrcnn_benchmark.trainer") logger.info("Start training") # meters = MetricLogger(delimiter=" ") max_iter = len(data_loader) start_iter = arguments["iteration"] model.train() start_training_time = time.time() end = time.time() iou_types = ("bbox", ) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm", ) if cfg.MODEL.KEYPOINT_ON: iou_types = iou_types + ("keypoints", ) # dataset_names = cfg.DATASETS.TEST backbone_rngs, head_rngs, inter_rngs, rng, rngs = None, None, None, None, None if 'search' in cfg.MODEL.BACKBONE.CONV_BODY or \ 'search' in cfg.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR or \ 'search' in cfg.MODEL.SEG_BRANCH.SEGMENT_BRANCH: # synchronize rngs num_states = sum(cfg.MODEL.BACKBONE.STAGE_REPEATS) if cfg.MODEL.SEG_BRANCH.SHARE_SUBNET: head_layers = len(cfg.MODEL.ROI_MASK_HEAD.CONV_LAYERS ) + cfg.MODEL.SEG_BRANCH.SUBNET_DEPTH else: head_layers = len(cfg.MODEL.ROI_MASK_HEAD.CONV_LAYERS ) + 4 * cfg.MODEL.SEG_BRANCH.SUBNET_DEPTH inter_layers = cfg.NAS.INTER_LAYERS backbone_ss_size = len(blocks_key) head_ss_size = len(head_ss_keys) inter_ss_size = cfg.NAS.INTER_SIZE if 'search' in cfg.MODEL.BACKBONE.CONV_BODY: _lcm = backbone_ss_size * head_ss_size // math.gcd( backbone_ss_size, head_ss_size) lcm = inter_ss_size * _lcm // math.gcd(inter_ss_size, _lcm) else: lcm = inter_ss_size * head_ss_size // math.gcd( inter_ss_size, head_ss_size) # print('lcm:', lcm) fwd_idx = -1 for iteration, (images, targets, segment_target, _, img_ids, ori_sizes) in enumerate(data_loader, start_iter): if any(len(target) < 1 for target in targets): logger.error( "Iteration={iteration + 1} || Image Ids used for training {_} || targets Length={[len(target) for target in targets]}" ) continue data_time = time.time() - end iteration = iteration + 1 arguments["iteration"] = iteration images = images.to(device) targets = [target.to(device) for target in targets] if 'search' in cfg.MODEL.BACKBONE.CONV_BODY or \ 'search' in cfg.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR or \ 'search' in cfg.MODEL.SEG_BRANCH.SEGMENT_BRANCH: if rngs is None or iteration % lcm == 0: del rngs if 'search' in cfg.MODEL.BACKBONE.CONV_BODY: backbone_rngs = generate_rng(num_states, backbone_ss_size, lcm) head_rngs = generate_rng(head_layers, head_ss_size, lcm) inter_rngs = generate_rng(inter_layers, inter_ss_size, lcm) if 'search' in cfg.MODEL.BACKBONE.CONV_BODY: rngs = np.concatenate( [backbone_rngs, head_rngs, inter_rngs], axis=0).transpose(1, 0) del backbone_rngs else: rngs = np.concatenate([head_rngs, inter_rngs], axis=0).transpose(1, 0) del head_rngs, inter_rngs rng = rngs[iteration % lcm] rng = broadcast_data(rng.tolist()) fwd_idx = (fwd_idx + 1) % lcm loss_dict = model(images, targets, segment_target, img_ids=img_ids, c2d=None, ori_sizes=ori_sizes, rngs=rng) del rng else: loss_dict = model(images, targets, segment_target, img_ids=img_ids, c2d=None, ori_sizes=ori_sizes) if cfg.MODEL.SEG_BRANCH.ADD_SEG_BRANCH: segmentation_loss = loss_dict.pop("loss_segmentation") losses = cfg.MODEL.SEG_BRANCH.LAMDA_INSTANCE * sum( loss for loss in loss_dict.values() ) + cfg.MODEL.SEG_BRANCH.LAMDA_SEGMENTATION * segmentation_loss loss_dict[ 'loss_segmentation'] = segmentation_loss # reproduce the complete loss else: losses = sum(loss for loss in loss_dict.values()) # reduce losses over all GPUs for logging purposes loss_dict_reduced = reduce_loss_dict(loss_dict) 
meters.update(loss=losses.item(), **loss_dict_reduced) if 'search' in cfg.MODEL.BACKBONE.CONV_BODY or \ 'search' in cfg.MODEL.SEG_BRANCH.SEGMENT_BRANCH or \ 'search' in cfg.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR: if fwd_idx == 0: optimizer.zero_grad() losses.backward() if fwd_idx == lcm - 1: optimizer.step() else: optimizer.zero_grad() # losses.backward() # Note: If mixed precision is not used, this ends up doing nothing # Otherwise apply loss scaling for mixed-precision recipe with amp.scale_loss(losses, optimizer) as scaled_losses: scaled_losses.backward() optimizer.step() scheduler.step() batch_time = time.time() - end end = time.time() meters.update(time=batch_time, data=data_time) del loss_dict, losses, images, targets # torch.cuda.empty_cache() eta_seconds = meters.time.global_avg * (max_iter - iteration) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) if iteration % 20 == 0 or iteration == max_iter: logger.info( meters.delimiter.join([ "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}", "max mem: {memory:.0f}", ]).format( eta=eta_string, iter=iteration, meters=str(meters), lr=optimizer.param_groups[0]["lr"], memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, )) if iteration % checkpoint_period == 0: checkpointer.save("model_{:07d}".format(iteration), **arguments) if data_loader_val is not None and test_period > 0 and iteration % test_period == 0: meters_val = MetricLogger(delimiter=" ") synchronize() _ = inference( # The result can be used for additional logging, e. g. for TensorBoard model, # The method changes the segmentation mask format in a data loader, # so every time a new data loader is created: make_data_loader(cfg, is_train=False, is_distributed=(get_world_size() > 1), is_for_period=True), dataset_name="[Validation]", iou_types=iou_types, box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=None, c2d_json_path=cfg.MODEL.SEG_BRANCH.JSON_PATH, cfg=cfg, ) synchronize() model.train() with torch.no_grad(): # Should be one image for each GPU: for iteration_val, (images_val, targets_val, _) in enumerate(tqdm(data_loader_val)): images_val = images_val.to(device) targets_val = [target.to(device) for target in targets_val] loss_dict = model(images_val, targets_val) # losses = sum(loss for loss in loss_dict.values()) if cfg.MODEL.SEG_BRANCH.ADD_SEG_BRANCH: segmentation_loss = loss_dict.pop("loss_segmentation") losses = cfg.MODEL.SEG_BRANCH.LAMDA_INSTANCE * sum( loss for loss in loss_dict.values() ) + cfg.MODEL.SEG_BRANCH.LAMDA_SEGMENTATION * segmentation_loss loss_dict[ 'loss_segmentation'] = segmentation_loss # reproduce the complete loss else: losses = sum(loss for loss in loss_dict.values()) loss_dict_reduced = reduce_loss_dict(loss_dict) meters_val.update(loss=losses.item(), **loss_dict_reduced) synchronize() logger.info( meters_val.delimiter.join([ "[Validation]: ", "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}", "max mem: {memory:.0f}", ]).format( eta=eta_string, iter=iteration, meters=str(meters_val), lr=optimizer.param_groups[0]["lr"], memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, )) if iteration == max_iter: checkpointer.save("model_final", **arguments) total_training_time = time.time() - start_training_time total_time_str = str(datetime.timedelta(seconds=total_training_time)) logger.info("Total training time: {} ({:.4f} s / it)".format( total_time_str, 
total_training_time / (max_iter)))
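# A sketch of the per-layer path sampler assumed by the single-path search loop
# above: for each layer, produce `lcm` operator choices that cover that layer's
# search space evenly (lcm is a common multiple of all search-space sizes, so
# tiling then shuffling divides exactly); rng is then broadcast so all ranks
# train the same sub-network each iteration. The real generate_rng /
# broadcast_data helpers may differ; this is an assumption for illustration.
import numpy as np

def generate_rng(num_layers, ss_size, lcm):
    choices = np.empty((num_layers, lcm), dtype=np.int64)
    for layer in range(num_layers):
        per_layer = np.tile(np.arange(ss_size), lcm // ss_size)
        np.random.shuffle(per_layer)
        choices[layer] = per_layer
    return choices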
def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0, is_for_period=False): num_gpus = get_world_size() if is_train: images_per_batch = cfg.SOLVER.IMS_PER_BATCH assert ( images_per_batch % num_gpus == 0 ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of GPUs ({}) used.".format( images_per_batch, num_gpus) images_per_gpu = images_per_batch // num_gpus shuffle = True num_iters = cfg.SOLVER.MAX_ITER else: images_per_batch = cfg.TEST.IMS_PER_BATCH assert ( images_per_batch % num_gpus == 0 ), "TEST.IMS_PER_BATCH ({}) must be divisible by the number of GPUs ({}) used.".format( images_per_batch, num_gpus) images_per_gpu = images_per_batch // num_gpus shuffle = False if not is_distributed else True num_iters = None start_iter = 0 if images_per_gpu > 1: logger = logging.getLogger(__name__) logger.warning( "When using more than one image per GPU you may encounter " "an out-of-memory (OOM) error if your GPU does not have " "sufficient memory. If this happens, you can reduce " "SOLVER.IMS_PER_BATCH (for training) or " "TEST.IMS_PER_BATCH (for inference). For training, you must " "also adjust the learning rate and schedule length according " "to the linear scaling rule. See for example: " "https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14" ) # group images which have similar aspect ratio. In this case, we only # group in two cases: those with width / height > 1, and the other way around, # but the code supports more general grouping strategy aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else [] paths_catalog = import_file( "maskrcnn_benchmark.config.paths_catalog", cfg.PATHS_CATALOG, True ) DatasetCatalog = paths_catalog.DatasetCatalog dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST # If bbox aug is enabled in testing, simply set transforms to None and we will apply transforms later transforms = None if not is_train and cfg.TEST.BBOX_AUG.ENABLED else build_transforms(cfg, is_train) datasets = build_dataset(dataset_list, transforms, DatasetCatalog, is_train or is_for_period) if is_train: # save category_id to label name mapping save_labels(datasets, cfg.OUTPUT_DIR) data_loaders = [] for dataset in datasets: sampler = make_data_sampler(dataset, shuffle, is_distributed) batch_sampler = make_batch_data_sampler( dataset, sampler, aspect_grouping, images_per_gpu, num_iters, start_iter ) collator = BBoxAugCollator() if not is_train and cfg.TEST.BBOX_AUG.ENABLED else \ BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY) num_workers = cfg.DATALOADER.NUM_WORKERS data_loader = torch.utils.data.DataLoader( dataset, num_workers=num_workers, batch_sampler=batch_sampler, collate_fn=collator, ) data_loaders.append(data_loader) if is_train or is_for_period: # during training, a single (possibly concatenated) data_loader is returned assert len(data_loaders) == 1 return data_loaders[0] return data_loaders
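# Usage sketch: the do_train variants above rebuild a validation loader every
# test_period iterations with is_for_period=True, so a single loader is
# returned even though is_train=False (assumes cfg is loaded and the
# distributed state is initialized).
data_loader_val_period = make_data_loader(
    cfg,
    is_train=False,
    is_distributed=(get_world_size() > 1),
    is_for_period=True,
)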
def inference( model, data_loader, dataset_name, iou_types=("bbox", ), box_only=False, device=torch.device("cuda"), expected_results=0, expected_results_sigma_tol=0, output_folder=None, cfg=None, bbox_aug=False, visualize_results=False, visualization_label="coco", only_visualization=False, ): num_devices = get_world_size() logger = logging.getLogger("maskrcnn_benchmark.inference") dataset = data_loader.dataset logger.info("Start evaluation on {} dataset({} images).".format( dataset_name, len(dataset))) total_timer = Timer() inference_timer = Timer() total_timer.tic() roi_predictions, img_predictions, attention_maps = compute_on_dataset( model, data_loader, device, bbox_aug=bbox_aug, timer=inference_timer) # wait for all processes to complete before measuring the time synchronize() total_time = total_timer.toc() total_time_str = get_time_str(total_time) logger.info( "Total run time: {} ({} s / img per device, on {} devices)".format( total_time_str, total_time * num_devices / len(dataset), num_devices)) total_infer_time = get_time_str(inference_timer.total_time) logger.info( "Model inference time: {} ({} s / img per device, on {} devices)". format( total_infer_time, inference_timer.total_time * num_devices / len(dataset), num_devices, )) if roi_predictions: roi_predictions = _accumulate_predictions_from_multiple_gpus( roi_predictions) if img_predictions: img_predictions = _accumulate_predictions_from_multiple_gpus( img_predictions) if attention_maps: attention_maps = _accumulate_predictions_from_multiple_gpus( attention_maps) if not is_main_process(): return if roi_predictions and len(roi_predictions) > 0: for prediction in roi_predictions: if prediction.has_field("pred_scores"): prediction.add_field('second_scores', prediction.get_field('pred_scores')) del prediction.extra_fields["pred_scores"] if prediction.has_field("pred_labels"): prediction.add_field('second_labels', prediction.get_field('pred_labels')) del prediction.extra_fields["pred_labels"] if output_folder: torch.save(roi_predictions, os.path.join(output_folder, "roi_predictions.pth")) print('Visualize results') if output_folder and visualize_results: categories = import_file( "maskrcnn_benchmark.data.datasets.categories.{}_categories". format(visualization_label), os.path.join( os.path.dirname(os.path.dirname(cfg.PATHS_CATALOG)), 'data', 'categories', '{}_categories.py'.format(visualization_label)), True) visualizer = Visualizer(categories=categories.CATEGORIES, cfg=cfg) visualizer.visualize_attentions( attention_maps, dataset, os.path.join(output_folder, 'attention_map')) visualizer.visualize_predictions( roi_predictions, dataset, os.path.join(output_folder, 'visualization')) if only_visualization: return extra_args = dict( box_only=box_only, iou_types=iou_types, expected_results=expected_results, expected_results_sigma_tol=expected_results_sigma_tol, ) print('ROI: Evaluate') evaluate_roi(dataset=dataset, predictions=roi_predictions, output_folder=output_folder, **extra_args) if img_predictions and len(img_predictions) > 0: if output_folder: torch.save(img_predictions, os.path.join(output_folder, "img_predictions.pth")) print('IMAGE: Evaluate') evaluate_img(dataset=dataset, predictions=img_predictions, output_folder=output_folder)
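# Usage sketch for the inference variant above: dump and visualize predictions
# without running the evaluators (the output folder is an assumption; the
# visualization_label must match a categories file shipped with the codebase).
inference(
    model,
    data_loader_val,
    dataset_name=cfg.DATASETS.TEST[0],
    iou_types=("bbox",),
    device=cfg.MODEL.DEVICE,
    output_folder="output/visualization_run",
    cfg=cfg,
    visualize_results=True,
    visualization_label="coco",
    only_visualization=True,
)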