def train_one_epoch_FRCNN(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    return metric_logger
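
# Both training loops rely on utils.warmup_lr_scheduler for the first-epoch warm-up.
# The function below is only a hedged sketch of what that helper presumably does, assuming
# it mirrors the torchvision detection reference; the code in this file keeps calling
# utils.warmup_lr_scheduler, not this illustrative local copy.
def _warmup_lr_scheduler_sketch(optimizer, warmup_iters, warmup_factor):
    def f(x):
        # Linearly ramp the LR multiplier from warmup_factor up to 1 over warmup_iters steps.
        if x >= warmup_iters:
            return 1
        alpha = float(x) / warmup_iters
        return warmup_factor * (1 - alpha) + alpha

    return torch.optim.lr_scheduler.LambdaLR(optimizer, f)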
def train_one_epoch_SSD(model, loss_func, optimizer, data_loader, encoder, epoch,
                        print_freq, mean, std, device):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    losses = AverageMeter()
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    # MultiBoxLoss built from the model's priors is the criterion actually used below;
    # loss_func built from the default COCO prior boxes is not used in this loop.
    criterion = MultiBoxLoss(priors_cxcy=model.priors_cxcy).to(device)
    dboxes = dboxes300_coco()
    loss_func = Loss(dboxes)

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        # SSD expects a single stacked tensor of fixed-size images
        images = torch.stack(images).to(device)
        bboxes = [t["boxes"].to(device) for t in targets]
        labels = [t["labels"].to(device) for t in targets]

        ploc, pscores = model(images)
        ploc, pscores = ploc.float(), pscores.float()

        # loss
        loss = criterion(ploc, pscores, bboxes, labels)

        # Backward prop.
        optimizer.zero_grad()
        loss.backward()

        # Update model
        optimizer.step()

        # Step the warm-up scheduler during the first epoch
        if lr_scheduler is not None:
            lr_scheduler.step()

        losses.update(loss.item(), images.size(0))
        metric_logger.update(loss=losses.val)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    return metric_logger
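
# train_one_epoch_SSD keeps a running average of the loss in an AverageMeter. A minimal
# sketch of such a helper (the classic PyTorch-examples pattern) is shown below, assuming
# this repo's AverageMeter exposes the same interface (.update(val, n), .val, .avg).
class _AverageMeterSketch:
    def __init__(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        # Record the latest value and fold it into the running average, weighted by n.
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count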
def main(args):
    utils.init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)

    # Creating tensorboard writer
    if not args.resume:
        writer = SummaryWriter(comment=TENSORBOARD_RESULT_FILE_NAME)
    else:
        writer = SummaryWriter("")

    ######################
    # Creating test data #
    ######################
    print("Loading test data")
    viped_dataset_test = get_dataset("viped", get_transform(train=False, aug=args.aug), percentage=5, val=True)
    mot19_dataset_test = get_dataset("mot19", get_transform(train=False), val=True)
    mot17_dataset_test = get_dataset("mot17", get_transform(train=False), val=True)
    crowd_human_dataset_test = get_dataset("crowd_human", get_transform(train=False), val=True)
    city_persons_dataset_test = get_dataset("city_persons", get_transform(train=False), val=True)
    coco_persons_dataset_test = get_dataset("COCO_persons", get_transform(train=False), val=True)

    ##########################
    # Creating training data #
    ##########################
    print("Loading training data")
    train_datasets_dict = {
        'viped': lambda: get_dataset("viped", get_transform(train=True, aug=args.aug)),
        'mot19': lambda: get_dataset("mot19", get_transform(train=True)),
        'mot17': lambda: get_dataset("mot17", get_transform(train=True)),
        'crowd_human': lambda: get_dataset("crowd_human", get_transform(train=True)),
        'city_persons': lambda: get_dataset("city_persons", get_transform(train=True)),
        'COCO_persons': lambda: get_dataset("COCO_persons", get_transform(train=True)),
    }

    #################################
    # Preparing training dataloader #
    #################################
    if args.train_on in train_datasets_dict:
        # the train dataset is a normal single dataset
        train_dataset = train_datasets_dict[args.train_on]()
        train_dataloader = DataLoader(
            train_dataset,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.workers,
            collate_fn=train_dataset.standard_collate_fn)
        print('Using training dataset: {}'.format(args.train_on))
    elif ',' in args.train_on:
        assert args.tgt_images_in_batch > 0, "Using mixed training. " \
            "You need to specify the args.tgt_images_in_batch parameter!"
        # the train dataset is an ensemble of datasets
        source_dataset_name, target_dataset_name = args.train_on.split(',')
        train_dataset = DatasetsEnsemble(
            train_datasets_dict[source_dataset_name](),
            train_datasets_dict[target_dataset_name]())
        train_dataloader = DataLoader(
            train_dataset,
            collate_fn=train_dataset.source_dataset.standard_collate_fn,
            num_workers=args.workers,
            batch_sampler=EnsembleBatchSampler(
                train_dataset,
                batch_size=args.batch_size,
                shuffle=True,
                tgt_imgs_in_batch=args.tgt_images_in_batch))
        print('Using mixed training datasets. Source: {}, Target: {}. '
              'In every batch, {}/{} are from {}'
              .format(source_dataset_name, target_dataset_name,
                      args.tgt_images_in_batch, args.batch_size, target_dataset_name))
    else:
        raise ValueError('Dataset not known!')

    ##############################
    # Preparing test dataloaders #
    ##############################
    data_loader_viped_test = DataLoader(
        viped_dataset_test, batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, collate_fn=viped_dataset_test.standard_collate_fn)
    data_loader_mot19_test = DataLoader(
        mot19_dataset_test, batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, collate_fn=mot19_dataset_test.standard_collate_fn)
    data_loader_mot17_test = DataLoader(
        mot17_dataset_test, batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, collate_fn=mot17_dataset_test.standard_collate_fn)
    data_loader_crowd_human_test = DataLoader(
        crowd_human_dataset_test, batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, collate_fn=crowd_human_dataset_test.standard_collate_fn)
    data_loader_city_persons_test = DataLoader(
        city_persons_dataset_test, batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, collate_fn=city_persons_dataset_test.standard_collate_fn)
    data_loader_coco_persons_test = DataLoader(
        coco_persons_dataset_test, shuffle=False,
        num_workers=args.workers, collate_fn=coco_persons_dataset_test.standard_collate_fn)

    # Creating model
    print("Creating model")
    model, backbone = get_model_detection(num_classes=1, model=args.model, pretrained=args.pretrained)

    # Putting model to device and setting train mode
    model.to(device)
    model.train()

    # freeze the backbone parameters, if needed
    if backbone is not None and args.freeze_backbone:
        for param in backbone.parameters():
            param.requires_grad = False
        print('Backbone is frozen!')

    # construct an optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    if args.optimizer == "sgd":
        optimizer = torch.optim.SGD(params, lr=args.lr, momentum=args.momentum,
                                    weight_decay=args.weight_decay)
    elif args.optimizer == "adam":
        optimizer = torch.optim.Adam(
            params=params,
            lr=args.lr,
        )
    else:
        print("Optimizer not available")
        exit(1)

    # and a learning rate scheduler
    if args.lr_scheduler == "step_lr":
        lr_scheduler = torch.optim.lr_scheduler.StepLR(
            optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
    elif args.lr_scheduler == "plateau":
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='max', patience=args.lr_patience, verbose=True)
    else:
        print("LR scheduler not available")
        exit(1)

    # Defining a warm-up lr scheduler
    warmup_iters = min(1000, len(train_dataloader) - 1)
    warmup_factor = 1. / 1000
    warmup_lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    # Loading checkpoint
    start_epoch = 0
    train_step = -1
    best_viped_ap, best_mot19_ap, best_mot17_ap, best_crowdhuman_ap, best_citypersons_ap, best_cocopersons_ap \
        = 0, 0, 0, 0, 0, 0
    if args.resume:
        print("Resuming from checkpoint")
        checkpoint = torch.load(args.resume)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        warmup_lr_scheduler.load_state_dict(checkpoint['warmup_lr_scheduler'])
        start_epoch = checkpoint['epoch']
        train_step = checkpoint['iteration']
        best_viped_ap = checkpoint['best_viped_ap']
        best_mot19_ap = checkpoint['best_mot19_ap']
        best_mot17_ap = checkpoint['best_mot17_ap']
        best_crowdhuman_ap = checkpoint['best_crowdhuman_ap']
        best_citypersons_ap = checkpoint['best_citypersons_ap']
        best_cocopersons_ap = checkpoint['best_cocopersons_ap']

    # Cross-check that the backbone has really been frozen
    if backbone is not None and args.freeze_backbone:
        for param in backbone.parameters():
            assert not param.requires_grad, "Backbone does not seem to be frozen correctly!"

    # Train
    print("Start training")
    for epoch in range(start_epoch, args.epochs):
        model.train()
        metric_logger = utils.MetricLogger(delimiter="  ")
        metric_logger.add_meter(
            'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
        header = 'Epoch: [{}]'.format(epoch)

        for images, targets in metric_logger.log_every(
                train_dataloader, print_freq=args.print_freq, header=header):
            train_step += 1
            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            loss_value = losses_reduced.item()

            if not math.isfinite(loss_value):
                print("Loss is {}, stopping training".format(loss_value))
                print(loss_dict_reduced)
                sys.exit(1)

            optimizer.zero_grad()
            losses.backward()
            # clip norm
            torch.nn.utils.clip_grad_norm_(model.parameters(), 50)
            optimizer.step()

            if epoch == 0 and train_step < warmup_iters:
                warmup_lr_scheduler.step()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            metric_logger.update(lr=optimizer.param_groups[0]["lr"])

            if train_step % args.log_loss == 0:
                writer.add_scalar('Training/Learning Rate',
                                  optimizer.param_groups[0]["lr"], train_step)
                writer.add_scalar('Training/Reduced Sum Losses', losses_reduced, train_step)
                writer.add_scalars('Training/All Losses', loss_dict, train_step)

            if (train_step % args.save_freq == 0 and train_step != 0) or \
                    (args.pretrained and train_step < 5 * args.save_freq
                     and train_step % 200 == 0 and train_step != 0) \
                    or train_step == 100:
                # evaluate on the test datasets
                print("Validation viped Dataset")
                viped_coco_evaluator = evaluate(model, data_loader_viped_test,
                                                device=device, max_dets=args.max_dets)
                print("Validation mot19 Dataset")
                mot19_coco_evaluator = evaluate(model, data_loader_mot19_test,
                                                device=device, max_dets=args.max_dets)
                print("Validation mot17 Dataset")
                mot17_coco_evaluator = evaluate(model, data_loader_mot17_test,
                                                device=device, max_dets=args.max_dets)
                print("Validation crowdhuman Dataset")
                crowdhuman_coco_evaluator = evaluate(
                    model, data_loader_crowd_human_test, device=device, max_dets=args.max_dets)
                print("Validation citypersons Dataset")
                citypersons_coco_evaluator = evaluate(
                    model, data_loader_city_persons_test, device=device, max_dets=args.max_dets)
                print("Validation COCO Persons Dataset")
                cocopersons_coco_evaluator = evaluate(
                    model, data_loader_coco_persons_test, device=device, max_dets=args.max_dets)

                # save using tensorboard
                viped_ap, mot19_ap, mot17_ap, crowdhuman_ap, citypersons_ap, cocopersons_ap = \
                    None, None, None, None, None, None
                for iou_type, coco_eval in viped_coco_evaluator.coco_eval.items():
                    viped_ap = coco_eval.stats[1]
                for iou_type, coco_eval in mot19_coco_evaluator.coco_eval.items():
                    mot19_ap = coco_eval.stats[1]
                for iou_type, coco_eval in mot17_coco_evaluator.coco_eval.items():
                    mot17_ap = coco_eval.stats[1]
                for iou_type, coco_eval in crowdhuman_coco_evaluator.coco_eval.items():
                    crowdhuman_ap = coco_eval.stats[1]
                for iou_type, coco_eval in citypersons_coco_evaluator.coco_eval.items():
                    citypersons_ap = coco_eval.stats[1]
                for iou_type, coco_eval in cocopersons_coco_evaluator.coco_eval.items():
                    cocopersons_ap = coco_eval.stats[1]

                writer.add_scalar('COCO mAP Validation/ViPeD', viped_ap, train_step)
                writer.add_scalar('COCO mAP Validation/MOT19', mot19_ap, train_step)
                writer.add_scalar('COCO mAP Validation/MOT17', mot17_ap, train_step)
                writer.add_scalar('COCO mAP Validation/CrowdHuman', crowdhuman_ap, train_step)
                writer.add_scalar('COCO mAP Validation/CityPersons', citypersons_ap, train_step)
                writer.add_scalar('COCO mAP Validation/COCOPersons', cocopersons_ap, train_step)

                def make_checkpoint_dict():
                    # Snapshot of the full training state; values are read at call time,
                    # i.e. after the corresponding best_*_ap has been updated.
                    return {
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'warmup_lr_scheduler':
                            warmup_lr_scheduler.state_dict()
                            if warmup_lr_scheduler is not None else None,
                        'epoch': epoch,
                        'iteration': train_step,
                        'best_viped_ap': best_viped_ap,
                        'best_mot19_ap': best_mot19_ap,
                        'best_mot17_ap': best_mot17_ap,
                        'best_crowdhuman_ap': best_crowdhuman_ap,
                        'best_citypersons_ap': best_citypersons_ap,
                        'best_cocopersons_ap': best_cocopersons_ap,
                    }

                # Saving the best models, if improved
                if viped_ap > best_viped_ap:
                    best_viped_ap = viped_ap
                    save_checkpoint(make_checkpoint_dict(), writer.get_logdir(), best_model="viped")
                if mot19_ap > best_mot19_ap:
                    best_mot19_ap = mot19_ap
                    save_checkpoint(make_checkpoint_dict(), writer.get_logdir(), best_model="mot19")
                if mot17_ap > best_mot17_ap:
                    best_mot17_ap = mot17_ap
                    save_checkpoint(make_checkpoint_dict(), writer.get_logdir(), best_model="mot17")
                if crowdhuman_ap > best_crowdhuman_ap:
                    best_crowdhuman_ap = crowdhuman_ap
                    save_checkpoint(make_checkpoint_dict(), writer.get_logdir(), best_model="crowdhuman")
                if citypersons_ap > best_citypersons_ap:
                    best_citypersons_ap = citypersons_ap
                    save_checkpoint(make_checkpoint_dict(), writer.get_logdir(), best_model="citypersons")

                # Saving model
                save_checkpoint(make_checkpoint_dict(), writer.get_logdir())

                # Setting again to train mode
                model.train()

            lr_scheduler.step()
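
# Both training scripts persist their state through a save_checkpoint helper that takes the
# state dict, the tensorboard log directory, and an optional best_model tag. Its actual
# implementation is not shown in this file; the function below is only a hypothetical sketch
# of that behaviour (the file names and layout are assumptions, not the project's real ones).
def _save_checkpoint_sketch(state, log_dir, best_model=None):
    import os
    # Best models get a per-dataset file name, the rolling checkpoint a fixed one.
    filename = "best_model_{}.pth".format(best_model) if best_model else "checkpoint.pth"
    torch.save(state, os.path.join(log_dir, filename))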
def main(args):
    utils.init_distributed_mode(args)
    print(args)

    # Opening YAML cfg config file
    with open(args.cfg_file, 'r') as stream:
        try:
            cfg_file = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)

    # Retrieving cfg
    train_cfg = cfg_file['training']
    model_cfg = cfg_file['model']
    data_cfg = cfg_file['dataset']

    # Setting device
    device = torch.device(model_cfg['device'])

    # It is not possible to set a checkpoint and a pre-trained model at the same time
    if train_cfg['checkpoint'] and train_cfg['pretrained_model']:
        print("You can't set a checkpoint and a pretrained model at the same time")
        exit(1)

    # Creating tensorboard writer
    if train_cfg['checkpoint']:
        checkpoint = torch.load(train_cfg['checkpoint'])
        writer = SummaryWriter(log_dir=checkpoint['tensorboard_working_dir'])
    else:
        writer = SummaryWriter(comment="_" + train_cfg['tensorboard_filename'])

    # Saving cfg file in the same folder
    copyfile(
        args.cfg_file,
        os.path.join(writer.get_logdir(), os.path.basename(args.cfg_file)))

    #######################
    # Creating model
    #######################
    print("Creating model")
    load_custom_model = False
    if train_cfg['checkpoint'] or train_cfg['pretrained_model']:
        load_custom_model = True
    model, backbone = get_model_detection(num_classes=1,
                                          cfg=model_cfg,
                                          load_custom_model=load_custom_model)

    # Putting model to device and setting train mode
    model.to(device)
    model.train()

    # Freeze the backbone parameters, if needed
    if backbone is not None and model_cfg['freeze_backbone']:
        for param in backbone.parameters():
            param.requires_grad = False
        print('Backbone is frozen!')

    #####################################
    # Creating datasets and dataloaders
    #####################################
    data_root = data_cfg['root']

    ################################
    # Creating training datasets and dataloaders
    print("Loading training data")
    train_datasets_names = data_cfg['train']

    if train_cfg['mixed_batch']:
        assert train_cfg['tgt_images_in_batch'] > 0, \
            "Using mixed training. You need to specify the tgt_images_in_batch parameter!"
        assert len(train_datasets_names) == 2, \
            "Using mixed training, you need to specify two datasets, " \
            "the first one as the source and the second one as the target"
        source_dataset = CustomYoloAnnotatedDataset(
            data_root, {
                list(train_datasets_names.keys())[0]:
                list(train_datasets_names.values())[0]
            },
            transforms=get_transform(train=True),
            phase='train')
        target_dataset = CustomYoloAnnotatedDataset(
            data_root, {
                list(train_datasets_names.keys())[1]:
                list(train_datasets_names.values())[1]
            },
            transforms=get_transform(train=True),
            phase='train')
        train_dataset = DatasetsEnsemble(source_dataset=source_dataset,
                                         target_dataset=target_dataset)
        train_dataloader = DataLoader(
            train_dataset,
            collate_fn=train_dataset.source_dataset.standard_collate_fn,
            num_workers=train_cfg['num_workers'],
            batch_sampler=EnsembleBatchSampler(
                train_dataset,
                batch_size=train_cfg['batch_size'],
                shuffle=True,
                tgt_imgs_in_batch=train_cfg['tgt_images_in_batch']))
        print('Using mixed training datasets. Source: {}, Target: {}. '
              'In every batch, {}/{} are from {}'.format(
                  list(train_datasets_names.keys())[0],
                  list(train_datasets_names.keys())[1],
                  train_cfg['tgt_images_in_batch'],
                  train_cfg['batch_size'],
                  list(train_datasets_names.keys())[1]))
    else:
        train_dataset = CustomYoloAnnotatedDataset(
            data_root,
            train_datasets_names,
            transforms=get_transform(train=True),
            phase='train')
        train_dataloader = DataLoader(
            train_dataset,
            batch_size=train_cfg['batch_size'],
            shuffle=False,
            num_workers=train_cfg['num_workers'],
            collate_fn=train_dataset.standard_collate_fn)

    ###############################
    # Creating validation datasets
    print("Loading validation data")
    val_datasets_names = data_cfg['val']

    # Creating dataset(s) and dataloader(s)
    val_dataloaders = dict()
    best_validation_ap = defaultdict(float)
    for dataset_name, dataset_cfg in val_datasets_names.items():
        val_dataset = CustomYoloAnnotatedDataset(
            data_root, {dataset_name: dataset_cfg},
            transforms=get_transform(),
            phase="val",
            percentage=train_cfg["percentage_val"])
        val_dataloader = DataLoader(val_dataset,
                                    batch_size=train_cfg['batch_size'],
                                    shuffle=False,
                                    num_workers=train_cfg['num_workers'],
                                    collate_fn=val_dataset.standard_collate_fn)
        # Adding created dataloader
        val_dataloaders[dataset_name] = val_dataloader
        # Initializing best validation ap value
        best_validation_ap[dataset_name] = 0.0

    #######################################
    # Defining optimizer and LR scheduler
    #######################################

    # Constructing an optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(
        params,
        lr=train_cfg['lr'],
        momentum=train_cfg['momentum'],
        weight_decay=train_cfg['weight_decay'],
    )

    # and a learning rate scheduler
    if model_cfg['coco_model_pretrained']:
        lr_step_size = min(25000, len(train_dataset))
    else:
        lr_step_size = min(40000, 2 * len(train_dataset))
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=lr_step_size,
                                                   gamma=train_cfg['lr_gamma'])

    # Defining a warm-up lr scheduler
    warmup_iters = min(1000, len(train_dataloader) - 1)
    warmup_factor = 1. / 1000
    warmup_lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    #############################
    # Resuming a model
    #############################
    start_epoch = 0
    train_step = -1

    # Optionally resuming from a pre-trained model
    if train_cfg['pretrained_model']:
        print("Resuming pre-trained model")
        if train_cfg['pretrained_model'].startswith('http://') or \
                train_cfg['pretrained_model'].startswith('https://'):
            pre_trained_model = torch.hub.load_state_dict_from_url(
                train_cfg['pretrained_model'],
                map_location='cpu',
                model_dir=model_cfg["cache_folder"])
        else:
            pre_trained_model = torch.load(train_cfg['pretrained_model'],
                                           map_location='cpu')
        model.load_state_dict(pre_trained_model['model'])

    # Optionally resuming from a saved checkpoint
    if train_cfg['checkpoint']:
        print("Resuming from a checkpoint")
        checkpoint = torch.load(train_cfg['checkpoint'])
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        warmup_lr_scheduler.load_state_dict(checkpoint['warmup_lr_scheduler'])
        start_epoch = checkpoint['epoch']
        train_step = checkpoint['iteration']
        for elem_name, elem in checkpoint.items():
            if elem_name.startswith("best_"):
                d_name = elem_name.split("_")[1]
                if d_name in best_validation_ap:
                    best_validation_ap[d_name] = elem
                else:
                    warnings.warn(
                        "The dataset {} was not used in the previous training".format(d_name))
                    best_validation_ap[d_name] = 0.0

    ################
    # Training
    ################
    print("Start training")
    for epoch in range(start_epoch, train_cfg['epochs']):
        model.train()
        metric_logger = utils.MetricLogger(delimiter="  ")
        metric_logger.add_meter(
            'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
        header = 'Epoch: [{}]'.format(epoch)

        for images, targets in metric_logger.log_every(
                train_dataloader, print_freq=train_cfg['print_freq'], header=header):
            train_step += 1
            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            loss_value = losses_reduced.item()

            if not math.isfinite(loss_value):
                # Print the images responsible for the non-finite loss before exiting
                for target in targets:
                    image_id = target['image_id'].item()
                    print(train_dataset.images[image_id])
                print("Loss is {}, stopping training".format(loss_value))
                print(loss_dict_reduced)
                sys.exit(1)

            optimizer.zero_grad()
            losses.backward()
            # clip norm
            torch.nn.utils.clip_grad_norm_(model.parameters(), 50)
            optimizer.step()

            if epoch == 0 and train_step < warmup_iters:
                warmup_lr_scheduler.step()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            metric_logger.update(lr=optimizer.param_groups[0]["lr"])

            if train_step % train_cfg['log_loss'] == 0:
                writer.add_scalar('Training/Learning Rate',
                                  optimizer.param_groups[0]["lr"], train_step)
                writer.add_scalar('Training/Reduced Sum Losses', losses_reduced, train_step)
                writer.add_scalars('Training/All Losses', loss_dict, train_step)

            if (train_step % train_cfg['save_freq'] == 0 and train_step != 0) \
                    or ((train_cfg['pretrained_model'] or model_cfg['coco_model_pretrained'])
                        and train_step < 6 * train_cfg['save_freq']
                        and train_step % 200 == 0 and train_step != 0):
                # Validation
                for val_name, val_dataloader in val_dataloaders.items():
                    print("Validation on {}".format(val_name))
                    coco_evaluator = evaluate(model,
                                              val_dataloader,
                                              device=device,
                                              max_dets=model_cfg["max_dets_per_image"])
                    ap = None
                    for iou_type, coco_eval in coco_evaluator.coco_eval.items():
                        ap = coco_eval.stats[1]
                    writer.add_scalar('COCO mAP Validation/{}'.format(val_name), ap, train_step)

                    # Saving the best model for this dataset, if improved
                    if ap > best_validation_ap[val_name]:
                        best_validation_ap[val_name] = ap
                        save_checkpoint(
                            {
                                'model': model.state_dict(),
                                'optimizer': optimizer.state_dict(),
                                'lr_scheduler': lr_scheduler.state_dict(),
                                'warmup_lr_scheduler':
                                    warmup_lr_scheduler.state_dict()
                                    if warmup_lr_scheduler is not None else None,
                                'epoch': epoch,
                                'iteration': train_step,
                                'best_{}_ap'.format(val_name): best_validation_ap[val_name],
                            },
                            writer.get_logdir(),
                            best_model=val_name)

                # Saving last model
                checkpoint_dict = {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'warmup_lr_scheduler':
                        warmup_lr_scheduler.state_dict()
                        if warmup_lr_scheduler is not None else None,
                    'epoch': epoch,
                    'iteration': train_step,
                    'tensorboard_working_dir': writer.get_logdir(),
                }
                for d_name, _ in val_dataloaders.items():
                    checkpoint_dict["best_{}_ap".format(d_name)] = best_validation_ap[d_name]
                save_checkpoint(checkpoint_dict, writer.get_logdir())

                # Setting again to train mode
                model.train()

            # Updating lr scheduler
            lr_scheduler.step()
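
# Throughout both scripts, the dataloaders use the datasets' standard_collate_fn so that
# variable-sized detection samples are not stacked into a single tensor. Below is a minimal
# sketch of what such a collate function typically looks like; the real method lives on the
# dataset classes and may differ in details.
def _detection_collate_sketch(batch):
    # Turn a list of (image, target) pairs into a tuple of images and a tuple of targets.
    return tuple(zip(*batch))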