def main(args):
    """Run single-image DETR inference.

    Builds the model, optionally resumes weights from ``args.resume``
    (URL or local path), then runs detection on ``args.source`` and plots
    the resulting boxes.
    """
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    # Keep a handle to the raw module so checkpoints load without the
    # DDP "module." prefix.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])

    # standard PyTorch mean-std input image normalization
    transform = T.Compose([
        T.Resize(800),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    pimg = Image.open(args.source).convert('RGB')
    t0 = time.time()
    scores, boxes = detect(pimg, model, transform, device)
    print(f'inference use {time.time() - t0} seconds.')
    plot_results(pimg, scores, boxes)
def main(args):
    """Data-pipeline sanity check: build the val dataset and print each
    batch's input shape and labels. No training is performed."""
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))
    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)  # NOTE(review): set but unused here

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    # Plain sequential sampling is enough for a pipeline check — no
    # distributed or random samplers needed.
    sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn_leimao,
                                 num_workers=args.num_workers)

    for inputs, labels in data_loader_val:
        print("---------------------")
        print(inputs.shape)
        print(labels)
def load_model(args):
    """Build a DETR model plus the NV test dataset, load a fixed checkpoint,
    and return ``(model, dataset_val, postprocessors, device)`` for
    downstream evaluation."""
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))
    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)
    device = torch.device(args.device)
    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    # NOTE(review): machine-specific, requires passwordless sudo, and
    # recursive 777 is a security smell — consider removing or gating this.
    os.system("sudo chmod -R 777 /home/shuxuang/.cache/")
    model, criterion, postprocessors = build_model(
        args)  # use the same model as detr paper on coco
    model.to(device)
    # Raw module handle so state dicts load without the DDP "module." prefix.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)
    # NV dataset test split: [annotation root, frame root] under $HOME/datasets.
    dataset_val = build_nvdataset(dataset_root=[
        os.path.join(os.environ["HOME"], 'datasets/test'),
        os.path.join(os.environ["HOME"], 'datasets/frames_nvidia')
    ],
                                  mode='test')
    # indices_50k = np.load(os.path.join(args.root_indices))
    # dataset_train = Subset(dataset_train_, indices_50k)
    print("Validation samples: %d" % (len(dataset_val)))
    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])
    output_dir = Path(args.output_dir)
    # NOTE(review): this unconditionally overrides any --resume the caller
    # passed, and the path is machine-specific — TODO make it a default
    # (only used when args.resume is unset) instead of a forced value.
    # args.resume = os.path.join(os.environ["HOME"], 'datasets/exps_detr_base/checkpoint0299.pth')
    args.resume = '/home/shuxuang/datasets/exps_detr_base/checkpoint0299.pth'
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            print('Loading model: %s' % args.resume)
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
    # if args.eval:
    #     evaluate(model, dataset_val, postprocessors, device)
    return model, dataset_val, postprocessors, device


# if __name__ == '__main__':
#     parser = argparse.ArgumentParser('DETR training and evaluation script', parents=[get_args_parser()])
#     args = parser.parse_args()
#     if args.output_dir:
#         Path(args.output_dir).mkdir(parents=True, exist_ok=True)
#     main(args)
# CUDA_VISIBLE_DEVICES=1 dazel run //sandbox/williamz/detr:train_with_eval -- --eval
# test:
# CUDA_VISIBLE_DEVICES=1 dazel run //sandbox/williamz/detr:eval -- --eval --resume /home/shuxuang/experiments/train_output/checkpoint.pth
# get info:
# maglev workflows get 0cf25940-3f00-5c2c-a8e8-1571e986513b
# maglev volumes mount -n train-outputs -v 4977fea-0998-4d5b-b557-ff17605f2098 -p /home/shuxuang/experiments/train_output
def main(args):
    """Train/evaluate the model with optional Weights & Biases logging.

    Bug fix: every W&B touch point (``wandb.config.*`` writes,
    ``wandb.run.id``, ``wandb.log``, ``wandb.save``) is now guarded by the
    same condition as ``wandb.init``. Previously only ``wandb.init`` and
    ``wandb.log`` checked ``args.no_wb``; running with ``--no_wb`` crashed
    on the unconditional config writes / ``wandb.run.id`` accesses because
    no run had been initialised. Behavior with W&B enabled is unchanged.
    """
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    use_wb = not args.no_wb  # single switch for every W&B call below

    # Save our Wandb metadata
    if use_wb:
        wandb.init(entity='dl-project',
                   project='dl-final-project',
                   name=args.wb_name,
                   notes=args.wb_notes,
                   reinit=True)
        wandb.config.epochs = args.epochs

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)
    # visualize_video(model, postprocessors)

    # Raw module handle so checkpoints load without the DDP "module." prefix.
    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of trainable params:', n_parameters)
    # Log total # of model parameters (including frozen) to W&B
    n_total_parameters = sum(p.numel() for p in model.parameters())
    print('total number of parameters:', n_total_parameters)
    if use_wb:
        wandb.config.n_parameters = n_parameters
        wandb.config.n_trainable_parameters = n_parameters  # better name
        wandb.config.n_total_parameters = n_total_parameters

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    # For visualization we want the raw images without any normalization
    # or random resizing
    dataset_val_without_resize = CocoDetection(
        "data/coco/val2017",
        annFile="data/coco/annotations/instances_val2017.json",
        transforms=T.Compose([T.ToTensor()]))

    # Save metadata about training + val datasets and batch size
    if use_wb:
        wandb.config.len_dataset_train = len(dataset_train)
        wandb.config.len_dataset_val = len(dataset_val)
        wandb.config.batch_size = args.batch_size

    if args.distributed:
        if args.cache_mode:
            sampler_train = samplers.NodeDistributedSampler(dataset_train)
            sampler_val = samplers.NodeDistributedSampler(dataset_val,
                                                          shuffle=False)
        else:
            sampler_train = samplers.DistributedSampler(dataset_train)
            sampler_val = samplers.DistributedSampler(dataset_val,
                                                      shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers,
                                   pin_memory=True)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers,
                                 pin_memory=True)

    # lr_backbone_names = ["backbone.0", "backbone.neck", "input_proj", "transformer.encoder"]
    def match_name_keywords(n, name_keywords):
        # True iff any keyword is a substring of parameter name `n`.
        out = False
        for b in name_keywords:
            if b in n:
                out = True
                break
        return out

    for n, p in model_without_ddp.named_parameters():
        print(n)

    # Three LR groups: main weights, backbone, and linear-projection layers.
    param_dicts = [{
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if not match_name_keywords(n, args.lr_backbone_names)
            and not match_name_keywords(n, args.lr_linear_proj_names)
            and p.requires_grad
        ],
        "lr": args.lr,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if match_name_keywords(n, args.lr_backbone_names) and p.requires_grad
        ],
        "lr": args.lr_backbone,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if match_name_keywords(n, args.lr_linear_proj_names)
            and p.requires_grad
        ],
        "lr": args.lr * args.lr_linear_proj_mult,
    }]

    # Not sure if we should save all hyperparameters in wandb.config?
    # just start with a few important ones
    if use_wb:
        wandb.config.lr = args.lr
        wandb.config.lr_backbone = args.lr_backbone

    if args.sgd:
        optimizer = torch.optim.SGD(param_dicts,
                                    lr=args.lr,
                                    momentum=0.9,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.AdamW(param_dicts,
                                      lr=args.lr,
                                      weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
        # Profiling counters added by thop etc. are not real weights.
        unexpected_keys = [
            k for k in unexpected_keys
            if not (k.endswith('total_params') or k.endswith('total_ops'))
        ]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            import copy
            # Restore optimizer state but keep the LRs computed from args.
            p_groups = copy.deepcopy(optimizer.param_groups)
            optimizer.load_state_dict(checkpoint['optimizer'])
            for pg, pg_old in zip(optimizer.param_groups, p_groups):
                pg['lr'] = pg_old['lr']
                pg['initial_lr'] = pg_old['initial_lr']
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            # todo: this is a hack for doing experiment that resume from checkpoint and also modify lr scheduler (e.g., decrease lr in advance).
            args.override_resumed_lr_drop = True
            if args.override_resumed_lr_drop:
                print(
                    'Warning: (hack) args.override_resumed_lr_drop is set to True, so args.lr_drop would override lr_drop in resumed lr_scheduler.'
                )
                lr_scheduler.step_size = args.lr_drop
                lr_scheduler.base_lrs = list(
                    map(lambda group: group['initial_lr'],
                        optimizer.param_groups))
            lr_scheduler.step(lr_scheduler.last_epoch)
            args.start_epoch = checkpoint['epoch'] + 1
        # check the resumed model
        if not args.eval:
            test_stats, coco_evaluator = evaluate(model, criterion,
                                                  postprocessors,
                                                  data_loader_val, base_ds,
                                                  device, args.output_dir)

    if args.eval:
        print("Generating visualizations...")
        visualize_bbox(model, postprocessors, data_loader_val, device,
                       dataset_val_without_resize)
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            checkpoint_file_for_wb = None
            if use_wb:
                # Per-run file name so W&B uploads never collide across runs.
                checkpoint_file_for_wb = str(
                    output_dir / f'{wandb.run.id}_checkpoint{epoch:04}.pth')
                checkpoint_paths.append(checkpoint_file_for_wb)
            # extra checkpoint before LR drop and every 5 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 5 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)
            # Save model checkpoint to W&B
            if use_wb:
                wandb.save(checkpoint_file_for_wb)

        # Generate visualizations for fixed(?) set of images every epoch
        print("Generating visualizations...")
        visualize_bbox(model, postprocessors, data_loader_val, device,
                       dataset_val_without_resize)

        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)

        log_stats = {
            **{f'train_{k}': v for k, v in train_stats.items()},
            **{f'test_{k}': v for k, v in test_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }

        # Save the COCO metrics properly
        metric_name = [
            "AP", "AP50", "AP75", "APs", "APm", "APl", "AR@1", "AR@10",
            "AR@100", "ARs", "ARm", "ARl"
        ]
        for i, metric_val in enumerate(log_stats["test_coco_eval_bbox"]):
            log_stats[metric_name[i]] = metric_val

        if use_wb:
            wandb.log(log_stats)
        print("train_loss: ", log_stats['train_loss'])

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")
            if use_wb:
                wandb.save(str(output_dir / "log.txt"))

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if use_wb:
                        eval_filename_for_wb = f'{wandb.run.id}_eval_{epoch:04}.pth'
                        eval_path_for_wb = str(output_dir / "eval" /
                                               eval_filename_for_wb)
                        filenames.append(eval_filename_for_wb)
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)
                    # TODO not sure if this file will end up being too big
                    # I think it's the COCO precision/recall metrics
                    # in some format...
                    # let's track it just in case to start!
                    if use_wb:
                        wandb.save(eval_path_for_wb)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    """Extract detection features over the full dataset, then combine the
    per-scene output files.

    Fix: the output directory is created with ``os.makedirs(...,
    exist_ok=True)`` instead of an exists-check followed by ``os.mkdir`` —
    the old form raised ``FileNotFoundError`` when a parent directory was
    missing and was racy when several distributed workers hit it at once.
    """
    utils.init_distributed_mode(args)
    if args.output_dir is None:
        args.output_dir = os.path.expanduser(
            '~/Data/AI2Thor_detection_features/')
    # makedirs handles missing parents and concurrent creation safely.
    os.makedirs(args.output_dir, exist_ok=True)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    # Raw module handle so checkpoints load without the DDP "module." prefix.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    # Backbone gets its own (typically smaller) learning rate.
    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    # Feature extraction walks the whole dataset in order, so no shuffling.
    dataset_all = build_dataset(image_set='all', args=args)
    if args.distributed:
        sampler_all = DistributedSampler(dataset_all, shuffle=False)
    else:
        sampler_all = torch.utils.data.SequentialSampler(dataset_all)
    data_loader_all = DataLoader(dataset_all,
                                 args.batch_size,
                                 sampler=sampler_all,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)
    base_ds = get_coco_api_from_dataset(dataset_all)

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    print("Start extracting features")
    start_time = time.time()
    extract_feature(model, criterion, postprocessors, data_loader_all, base_ds,
                    device, args.output_dir)
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Extracting features time {}'.format(total_time_str))

    print('Start combining files')
    start_time = time.time()
    data_dir = os.path.expanduser('~/Data/AI2Thor_offline_data_2.0.2/')
    combine_files(args, data_dir)
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Combining files time {}'.format(total_time_str))
def main(args):
    """Train the model starting from a COCO-pretrained DETR checkpoint.

    Fix: the two "extra checkpoint" conditions could both fire for the same
    epoch (e.g. ``lr_drop=100`` and ``epoch + 1 == 200`` satisfies both
    ``% 100 == 0`` and ``> lr_drop and % 10 == 0``), appending the identical
    path twice and serializing the same checkpoint twice per epoch. The
    conditions are now merged so the path is appended at most once.
    """
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))
    print(args)

    device = torch.device(args.device)

    # Fix the seed for reproducibility.
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion = build_model(args)
    model.to(device)

    # Raw module handle so state dicts load without the DDP "module." prefix.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    # Backbone gets its own (typically smaller) learning rate.
    param_dicts = [
        {"params": [p for n, p in model_without_ddp.named_parameters()
                    if "backbone" not in n and p.requires_grad]},
        {
            "params": [p for n, p in model_without_ddp.named_parameters()
                       if "backbone" in n and p.requires_grad],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)

    # Load from pretrained DETR model.
    assert args.num_queries == 100, args.num_queries
    assert args.enc_layers == 6 and args.dec_layers == 6
    assert args.backbone in ['resnet50', 'resnet101', 'swin'], args.backbone
    if args.backbone == 'resnet50':
        pretrain_model = './data/detr_coco/detr-r50-e632da11.pth'
    elif args.backbone == 'resnet101':
        pretrain_model = './data/detr_coco/detr-r101-2c7b67e5.pth'
    else:
        pretrain_model = None  # no COCO-pretrained weights for swin here
    if pretrain_model is not None:
        pretrain_dict = torch.load(pretrain_model, map_location='cpu')['model']
        my_model_dict = model_without_ddp.state_dict()
        # Keep only weights whose names exist in the current model.
        pretrain_dict = {k: v for k, v in pretrain_dict.items()
                         if k in my_model_dict}
        my_model_dict.update(pretrain_dict)
        model_without_ddp.load_state_dict(my_model_dict)

    output_dir = Path(args.output_dir)
    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(
            model, criterion, data_loader_train, optimizer, device, epoch,
            args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # Extra checkpoint before the LR drop, every 100 epochs, and
            # every 10 epochs after the LR drop — appended at most once.
            if ((epoch + 1) % args.lr_drop == 0
                    or (epoch + 1) % 100 == 0
                    or ((epoch + 1) > args.lr_drop and (epoch + 1) % 10 == 0)):
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    """Train DETR on the NV dataset (no per-epoch evaluation): builds the
    train/test splits via ``build_nvdataset``, optionally subsets training
    with precomputed indices, and runs the standard train/checkpoint loop."""
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))
    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)
    device = torch.device(args.device)
    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    # os.system("sudo chmod -R 777 /home/shuxuang/.cache/")
    model, criterion, postprocessors = build_model(
        args)  # use the same model as detr paper on coco
    model.to(device)
    # Raw module handle so state dicts load without the DDP "module." prefix.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)
    # Backbone gets its own (typically smaller) learning rate.
    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)
    dataset_train = build_nvdataset(
        dataset_root=[args.dataset_root_sql, args.dataset_root_img],
        mode='train',
        camera=args.camera)
    # NOTE(review): the test root is passed twice here, while train (and the
    # sibling load_model) pair an annotation root with an image root —
    # verify the second element should not be args.dataset_root_img.
    dataset_val = build_nvdataset(
        dataset_root=[args.dataset_root_test, args.dataset_root_test],
        mode='test',
        camera=args.camera)
    if args.root_indices is not None:
        # Optional subset of the training data from precomputed indices
        # (e.g. a 50k active-learning selection).
        indices_50k = np.load(os.path.join(args.root_indices))
        dataset_train = Subset(dataset_train, indices_50k)
    print("Train samples: %d" % (len(dataset_train)))
    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)
    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    # NOTE(review): data_loader_val is built but never consumed — evaluation
    # in this variant is commented out below.
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)
    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])
    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1
    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 50 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 50 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)
        # Evaluation intentionally disabled in this variant; only train
        # stats are logged.
        log_stats = {
            **{f'train_{k}': v for k, v in train_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }
        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    """Train/evaluate a QPIC-style HOI detection model, logging to Weights & Biases.

    Handles distributed setup, model/optimizer construction, checkpoint
    resume/pretrain loading, the epoch loop with periodic checkpointing,
    and per-dataset wandb metric logging (hico / vcoco).
    """
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))
    wandb.init(project="qpic-project", entity="sangbaeklee", group="experiment_qpic")
    # NOTE(review): direct assignment to wandb.config; wandb docs recommend
    # passing config= to wandb.init or wandb.config.update — confirm this
    # records the values as intended.
    wandb.config = {
        "learning_rate": args.lr,
        "epochs": args.epochs,
        "batch_size": args.batch_size,
    }

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility (rank offset keeps workers decorrelated)
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)
    wandb.watch(model)

    # Keep a handle on the bare module so state_dicts have no "module." prefix.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    # Backbone parameters get a separate (typically lower) learning rate.
    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val, args.batch_size,
                                 sampler=sampler_val, drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    # COCO-style evaluation only applies to the non-HOI path.
    if not args.hoi:
        if args.dataset_file == "coco_panoptic":
            # We also evaluate AP during panoptic training, on original coco DS
            coco_val = datasets.coco.build("val", args)
            base_ds = get_coco_api_from_dataset(coco_val)
        else:
            base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        # Restore optimizer/scheduler state only for continued training.
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1
    elif args.pretrained:
        # Weights-only warm start; shapes may differ, hence strict=False.
        checkpoint = torch.load(args.pretrained, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'], strict=False)

    if args.eval:
        if args.hoi:
            test_stats = evaluate_hoi(args.dataset_file, model, postprocessors,
                                      data_loader_val,
                                      args.subject_category_id, device)
            return
        else:
            test_stats, coco_evaluator = evaluate(model, criterion,
                                                  postprocessors,
                                                  data_loader_val, base_ds,
                                                  device, args.output_dir)
            if args.output_dir:
                utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                     output_dir / "eval.pth")
            return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            # Reshuffle shards per epoch in distributed mode.
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        # Evaluate every epoch; HOI mode has no COCO evaluator object.
        if args.hoi:
            test_stats = evaluate_hoi(args.dataset_file, model, postprocessors,
                                      data_loader_val,
                                      args.subject_category_id, device)
            coco_evaluator = None
        else:
            test_stats, coco_evaluator = evaluate(model, criterion,
                                                  postprocessors,
                                                  data_loader_val, base_ds,
                                                  device, args.output_dir)

        log_stats = {
            **{f'train_{k}': v for k, v in train_stats.items()},
            **{f'test_{k}': v for k, v in test_stats.items()},
            'epoch': epoch,
            'n_parameters': n_parameters
        }
        #import pdb; pdb.set_trace()
        if args.dataset_file == 'hico':
            wandb.log({
                "loss": train_stats['loss'],
                "mAP": test_stats['mAP'],
                "mAP rare": test_stats['mAP rare'],
                "mAP non-rare": test_stats['mAP non-rare'],
                "mean max recall": test_stats['mean max recall']
            })
        elif args.dataset_file == 'vcoco':
            wandb.log({
                "mAP_all": test_stats['mAP_all'],
                "mAP_thesis": test_stats['mAP_thesis'],
                "AP_hold_obj": test_stats['AP_hold_obj'],
                "AP_stand": test_stats['AP_stand'],
                "AP_sit_instr": test_stats['AP_sit_instr'],
                "AP_ride_instr": test_stats['AP_ride_instr'],
                "AP_walk": test_stats['AP_walk'],
                "AP_look_obj": test_stats['AP_look_obj'],
                "AP_hit_instr": test_stats['AP_hit_instr'],
                "AP_hit_obj": test_stats['AP_hit_obj'],
                "AP_eat_obj": test_stats['AP_eat_obj'],
                "AP_eat_instr": test_stats['AP_eat_instr'],
                "AP_jump_instr": test_stats['AP_jump_instr'],
                "AP_lay_instr": test_stats['AP_lay_instr'],
                "AP_talk_on_phone_instr": test_stats['AP_talk_on_phone_instr'],
                "AP_carry_obj": test_stats['AP_carry_obj'],
                "AP_throw_obj": test_stats['AP_throw_obj'],
                "AP_catch_obj": test_stats['AP_catch_obj'],
                "AP_cut_instr": test_stats['AP_cut_instr'],
                "AP_cut_obj": test_stats['AP_cut_obj'],
                "AP_run": test_stats['AP_run'],
                "AP_work_on_computer_instr": test_stats['AP_work_on_computer_instr'],
                "AP_ski_instr": test_stats['AP_ski_instr'],
                "AP_surf_instr": test_stats['AP_surf_instr'],
                "AP_skateboard_instr": test_stats['AP_skateboard_instr'],
                "AP_smile": test_stats['AP_smile'],
                "AP_drink_instr": test_stats['AP_drink_instr'],
                "AP_kick_obj": test_stats['AP_kick_obj'],
                "AP_point_instr": test_stats['AP_point_instr'],
                "AP_read_obj": test_stats['AP_read_obj'],
                "AP_snowboard_instr": test_stats['AP_snowboard_instr'],
                "loss": train_stats['loss']
            })
        else:
            # NOTE(review): `continue` here also skips the log.txt write and
            # coco-eval dump below for any other dataset_file — confirm this
            # is intentional and not just "skip wandb logging".
            continue

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")
            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    """Train a VisTR-style model starting from COCO-pretrained DETR weights.

    Sets up (optionally distributed) training, loads a COCO checkpoint while
    dropping the class/query embeddings (their shapes differ for this task),
    optionally resumes from a full training checkpoint, then runs the epoch
    loop, saving a checkpoint every epoch.
    """
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))
    device = torch.device(args.device)

    # fix the seed for reproducibility (rank offset decorrelates workers)
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    # Keep a handle on the bare module so state_dicts have no "module." prefix
    # and weight loading works with or without DDP.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    # Backbone parameters get their own (usually lower) learning rate.
    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    # no validation ground truth for ytvos dataset
    dataset_train = build_dataset(image_set='train', args=args)
    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)

    output_dir = Path(args.output_dir)

    # Load coco pretrained weights. The classification head and query
    # embeddings have task-specific shapes, so drop them and load non-strictly.
    checkpoint = torch.load(args.pretrained_weights, map_location='cpu')['model']
    del checkpoint["vistr.class_embed.weight"]
    del checkpoint["vistr.class_embed.bias"]
    del checkpoint["vistr.query_embed.weight"]
    # BUGFIX: the original used model.module.load_state_dict(...), which
    # raises AttributeError when args.distributed is False (a plain nn.Module
    # has no .module). model_without_ddp is the correct target in both modes.
    model_without_ddp.load_state_dict(checkpoint, strict=False)

    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        # Restore optimizer/scheduler state only when continuing training.
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            # Reshuffle shards per epoch in distributed mode.
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop; the `% 1` term makes a numbered
            # checkpoint every epoch (kept for backward compatibility).
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 1 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    """Train/evaluate a few-shot detection model (Meta-DETR style).

    Builds train/val/support datasets, constructs keyword-filtered parameter
    groups (different LRs for backbone / linear projections / embeddings),
    applies a warmup LR schedule, optionally re-initializes novel-class
    classifier weights for few-shot finetuning, then trains with periodic
    checkpointing and evaluation.
    """
    utils.init_distributed_mode(args)
    print(args)
    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)
    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    # Few-shot finetuning trains on the 'fewshot' split instead of 'train'.
    image_set = 'fewshot' if args.fewshot_finetune else 'train'
    dataset_train = build_dataset(image_set=image_set, args=args)
    dataset_val = build_dataset(image_set='val', args=args)
    dataset_support = build_support_dataset(image_set=image_set, args=args)

    if args.distributed:
        if args.cache_mode:
            sampler_train = samplers.NodeDistributedSampler(dataset_train)
            sampler_val = samplers.NodeDistributedSampler(dataset_val, shuffle=False)
            sampler_support = samplers.NodeDistributedSampler(dataset_support)
        else:
            sampler_train = samplers.DistributedSampler(dataset_train)
            sampler_val = samplers.DistributedSampler(dataset_val, shuffle=False)
            sampler_support = samplers.DistributedSampler(dataset_support)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)
        sampler_support = torch.utils.data.RandomSampler(dataset_support)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=False)
    loader_train = DataLoader(dataset_train,
                              batch_sampler=batch_sampler_train,
                              collate_fn=utils.collate_fn,
                              num_workers=args.num_workers,
                              pin_memory=True)
    loader_val = DataLoader(dataset_val, batch_size=args.batch_size,
                            sampler=sampler_val, drop_last=False,
                            collate_fn=utils.collate_fn,
                            num_workers=args.num_workers,
                            pin_memory=True)
    # Support images are consumed one at a time (batch_size=1, no collate_fn).
    loader_support = DataLoader(dataset_support, batch_size=1,
                                sampler=sampler_support, drop_last=False,
                                num_workers=args.num_workers,
                                pin_memory=False)

    def match_name_keywords(n, name_keywords):
        # True iff any keyword is a substring of the parameter name.
        out = False
        for b in name_keywords:
            if b in n:
                out = True
                break
        return out

    for n, p in model_without_ddp.named_parameters():
        print(n)

    if not args.fewshot_finetune:
        # Base training: three LR groups — main, backbone, linear projections.
        param_dicts = [{
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if not match_name_keywords(n, args.lr_backbone_names) and not match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad
            ],
            "lr": args.lr,
            "initial_lr": args.lr,
        }, {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if match_name_keywords(n, args.lr_backbone_names) and p.requires_grad
            ],
            "lr": args.lr_backbone,
            "initial_lr": args.lr_backbone,
        }, {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad
            ],
            "lr": args.lr * args.lr_linear_proj_mult,
            "initial_lr": args.lr * args.lr_linear_proj_mult,
        }]
    else:
        # For few-shot finetune stage, do not train sampling offsets,
        # reference points, and embedding related parameters
        param_dicts = [
            {
                "params": [p for n, p in model_without_ddp.named_parameters()
                           if not match_name_keywords(n, args.lr_backbone_names) and \
                           not match_name_keywords(n, args.lr_linear_proj_names) and \
                           not match_name_keywords(n, args.embedding_related_names) and p.requires_grad],
                "lr": args.lr,
                "initial_lr": args.lr,
            },
            {
                "params": [p for n, p in model_without_ddp.named_parameters()
                           if match_name_keywords(n, args.lr_backbone_names) and p.requires_grad],
                "lr": args.lr_backbone,
                "initial_lr": args.lr_backbone,
            },
        ]

    # Per-group LRs come from param_dicts; only weight decay is global here.
    optimizer = torch.optim.AdamW(param_dicts, weight_decay=args.weight_decay)
    lr_scheduler = WarmupMultiStepLR(optimizer,
                                     args.lr_drop_milestones,
                                     gamma=0.1,
                                     warmup_epochs=args.warmup_epochs,
                                     warmup_factor=args.warmup_factor,
                                     warmup_method='linear',
                                     last_epoch=args.start_epoch - 1)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.dataset.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
        # Keys ending in total_params/total_ops come from profiling wrappers
        # and are not real weights, so they are not reported.
        unexpected_keys = [
            k for k in unexpected_keys
            if not (k.endswith('total_params') or k.endswith('total_ops'))
        ]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))

        if args.fewshot_finetune:
            if args.category_codes_cls_loss:
                # Re-init weights of novel categories for few-shot finetune
                novel_class_ids = datasets.get_class_ids(args.dataset_file, type='novel')
                if args.num_feature_levels == 1:
                    for novel_class_id in novel_class_ids:
                        nn.init.normal_(model_without_ddp.category_codes_cls.L.weight[novel_class_id])
                elif args.num_feature_levels > 1:
                    # One classifier per feature level.
                    for classifier in model_without_ddp.category_codes_cls:
                        for novel_class_id in novel_class_ids:
                            nn.init.normal_(classifier.L.weight[novel_class_id])
                else:
                    raise RuntimeError

    if args.eval:
        # Evaluate only base categories
        test_stats, coco_evaluator = evaluate(args, model, criterion,
                                              postprocessors, loader_val,
                                              loader_support, base_ds, device,
                                              type='base')
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval_base.pth")
        # Evaluate only novel categories
        test_stats, coco_evaluator = evaluate(args, model, criterion,
                                              postprocessors, loader_val,
                                              loader_support, base_ds, device,
                                              type='novel')
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval_novel.pth")
        return

    print("Start training...")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(args, model, criterion, loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()

        # Saving Checkpoints after each epoch (base training only)
        if args.output_dir and (not args.fewshot_finetune):
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        # Saving Checkpoints every args.save_every_epoch epoch(s)
        if args.output_dir:
            checkpoint_paths = []
            if (epoch + 1) % args.save_every_epoch == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        # Evaluation and Logging
        if (epoch + 1) % args.eval_every_epoch == 0:
            # Choose which category split to evaluate on.
            if 'base' in args.dataset_file:
                evaltype = 'base'
            else:
                evaltype = 'all'
            if args.fewshot_finetune:
                evaltype = 'novel'
            test_stats, coco_evaluator = evaluate(args, model, criterion,
                                                  postprocessors, loader_val,
                                                  loader_support, base_ds,
                                                  device, type=evaltype)
            log_stats = {
                **{f'train_{k}': v for k, v in train_stats.items()},
                **{f'test_{k}': v for k, v in test_stats.items()},
                'epoch': epoch,
                'n_parameters': n_parameters,
                'evaltype': evaltype
            }
            if args.output_dir and utils.is_main_process():
                with (output_dir / "results.txt").open("a") as f:
                    f.write(json.dumps(log_stats) + "\n")
                # for evaluation logs
                if coco_evaluator is not None:
                    (output_dir / 'eval').mkdir(exist_ok=True)
                    if "bbox" in coco_evaluator.coco_eval:
                        filenames = ['latest.pth']
                        filenames.append(f'{epoch:03}.pth')
                        for name in filenames:
                            torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                       output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    """Train/evaluate a LETR-style line-detection model.

    Besides the usual DETR setup, this handles migration of checkpoints from
    an older implementation (renaming "bbox_embed.*" keys to "lines_embed.*")
    and selective loading of frozen weights by submodule.
    """
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))
    output_dir = Path(args.output_dir)
    print(args)
    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    # Backbone parameters get a separate learning rate.
    param_dicts = [
        {"params": [p for n, p in model_without_ddp.named_parameters()
                    if "backbone" not in n and p.requires_grad]},
        {
            "params": [p for n, p in model_without_ddp.named_parameters()
                       if "backbone" in n and p.requires_grad],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    if args.eval:
        # Eval mode: only a val loader on the requested split is needed.
        dataset_val = build_dataset(image_set=args.dataset, args=args)
        if args.distributed:
            sampler_val = DistributedSampler(dataset_val, shuffle=False)
        else:
            sampler_val = torch.utils.data.SequentialSampler(dataset_val)
        data_loader_val = DataLoader(dataset_val, args.batch_size,
                                     sampler=sampler_val, drop_last=False,
                                     collate_fn=utils.collate_fn,
                                     num_workers=args.num_workers)
    else:
        dataset_train = build_dataset(image_set='train', args=args)
        dataset_val = build_dataset(image_set='val', args=args)
        if args.distributed:
            sampler_train = DistributedSampler(dataset_train)
            sampler_val = DistributedSampler(dataset_val, shuffle=False)
        else:
            # NOTE(review): SequentialSampler for training means no shuffling
            # in single-GPU runs — confirm this is intentional.
            sampler_train = torch.utils.data.SequentialSampler(dataset_train)
            sampler_val = torch.utils.data.SequentialSampler(dataset_val)
        batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                            args.batch_size,
                                                            drop_last=True)
        data_loader_train = DataLoader(dataset_train,
                                       batch_sampler=batch_sampler_train,
                                       collate_fn=utils.collate_fn,
                                       num_workers=args.num_workers)
        data_loader_val = DataLoader(dataset_val, args.batch_size,
                                     sampler=sampler_val, drop_last=False,
                                     collate_fn=utils.collate_fn,
                                     num_workers=args.num_workers)
    base_ds = get_coco_api_from_dataset(dataset_val)

    if args.resume and args.frozen_weights:
        # Mutually exclusive options.
        assert False
    elif args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
            # Drop task-specific heads (and input_proj when the layer config
            # differs) so only compatible weights are transferred.
            new_state_dict = {}
            for k in checkpoint['model']:
                if ("class_embed" in k) or ("bbox_embed" in k) or ("query_embed" in k):
                    continue
                if ("input_proj" in k) and args.layer1_num != 3:
                    continue
                new_state_dict[k] = checkpoint['model'][k]
            # Compare load model and current model
            current_param = [n for n, p in model_without_ddp.named_parameters()]
            current_buffer = [n for n, p in model_without_ddp.named_buffers()]
            load_param = new_state_dict.keys()
            for p in load_param:
                if p not in current_param and p not in current_buffer:
                    print(p, 'NOT appear in current model. ')
            for p in current_param:
                if p not in load_param:
                    print(p, 'NEW parameter. ')
            model_without_ddp.load_state_dict(new_state_dict, strict=False)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
            # this is to compromise old implementation
            new_state_dict = {}
            for k in checkpoint['model']:
                if "bbox_embed" in k:
                    print("bbox_embed from OLD implementation has been replaced with lines_embed")
                    # Rename the head prefix, keeping the rest of the key path.
                    new_state_dict["lines_embed." + '.'.join(k.split('.')[1:])] = checkpoint['model'][k]
                else:
                    new_state_dict[k] = checkpoint['model'][k]
            # compare resume model and current model
            current_param = [n for n, p in model_without_ddp.named_parameters()]
            current_buffer = [n for n, p in model_without_ddp.named_buffers()]
            load_param = new_state_dict.keys()
            #for p in load_param:
            #    if p not in current_param and p not in current_buffer:
            #        print(p, 'not been loaded to current model. Strict == False?')
            for p in current_param:
                if p not in load_param:
                    print(p, 'is a new parameter. Not found from load dict.')
            # load model
            model_without_ddp.load_state_dict(new_state_dict)
            # load optimizer
            if not args.no_opt and not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
                optimizer.load_state_dict(checkpoint['optimizer'])
                checkpoint['lr_scheduler']['step_size'] = args.lr_drop  # change the lr_drop epoch
                lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
                args.start_epoch = checkpoint['epoch'] + 1
    elif args.frozen_weights:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        new_state_dict = {}
        for k in checkpoint['model']:
            if "bbox_embed" in k:
                new_state_dict["lines_embed." + '.'.join(k.split('.')[1:])] = checkpoint['model'][k]
            else:
                new_state_dict[k] = checkpoint['model'][k]
        model_without_ddp.letr.load_state_dict(new_state_dict)
        # params — load each submodule's weights non-strictly as well.
        encoder = {k: v for k, v in new_state_dict.items() if "encoder" in k}
        decoder = {k: v for k, v in new_state_dict.items() if "decoder" in k}
        class_embed = {k: v for k, v in new_state_dict.items() if "class_embed" in k}
        line_embed = {k: v for k, v in new_state_dict.items() if "lines_embed" in k}
        model_without_ddp.load_state_dict(encoder, strict=False)
        model_without_ddp.load_state_dict(decoder, strict=False)
        model_without_ddp.load_state_dict(class_embed, strict=False)
        model_without_ddp.load_state_dict(line_embed, strict=False)
        print('Finish load frozen_weights')
    else:
        print("NO RESUME. TRAIN FROM SCRATCH")

    if args.eval:
        test_stats = evaluate(model, criterion, postprocessors,
                              data_loader_val, base_ds, device,
                              args.output_dir, args)
        #print('checkpoint'+ str(checkpoint['epoch']))
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, postprocessors,
                                      data_loader_train, optimizer, device,
                                      epoch, args.clip_max_norm, args)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoints/checkpoint.pth']
            # extra checkpoint before LR drop and every several epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % args.save_freq == 0:
                checkpoint_paths.append(output_dir / f'checkpoints/checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        # Evaluate every epoch and append metrics (fixed-point formatted).
        test_stats = evaluate(model, criterion, postprocessors,
                              data_loader_val, base_ds, device,
                              args.output_dir, args)
        log_stats = {**{f'train_{k}': format(v, ".6f") for k, v in train_stats.items()},
                     **{f'test_{k}': format(v, ".6f") for k, v in test_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}
        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    """Run detection inference on every image in ``args.imgs_dir``.

    Loads model weights from ``args.resume``, runs top-100 sigmoid scoring,
    keeps boxes with score > 0.2, rescales them to image coordinates, and
    saves a red-rectangle visualization per image under ./visualize/.
    """
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))
    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    checkpoint = torch.load(args.resume, map_location='cpu')
    model.load_state_dict(checkpoint['model'], strict=False)
    model.eval()

    # BUGFIX: the original moved the model with an optional model.cuda() but
    # then called img.cuda()/target_sizes.cuda() unconditionally, crashing on
    # CPU-only hosts. All tensors now follow args.device consistently.
    # ROBUSTNESS: ensure the output directory exists before saving into it.
    os.makedirs('./visualize', exist_ok=True)

    for img_file in os.listdir(args.imgs_dir):
        t0 = time.time()
        img_path = os.path.join(args.imgs_dir, img_file)
        out_imgName = './visualize/' + 'out_' + img_file[:-4] + '.png'
        im = Image.open(img_path)

        # mean-std normalize the input image (batch-size: 1)
        img = transform(im).unsqueeze(0)
        img = img.to(device)

        # propagate through the model
        outputs = model(img)

        out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes']
        prob = out_logits.sigmoid()
        # Top-100 (query, class) pairs over the flattened score matrix.
        topk_values, topk_indexes = torch.topk(
            prob.view(out_logits.shape[0], -1), 100, dim=1)
        scores = topk_values
        # Recover query index and class label from the flattened index.
        topk_boxes = topk_indexes // out_logits.shape[2]
        labels = topk_indexes % out_logits.shape[2]
        boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)
        boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))

        # Keep confident detections only.
        keep = scores[0] > 0.2
        boxes = boxes[0, keep]
        labels = labels[0, keep]

        # and from relative [0, 1] to absolute [0, height] coordinates.
        # PIL's .size is (width, height); the names below are swapped, but the
        # resulting scale factor [w, h, w, h] is consistent.
        im_h, im_w = im.size
        #print('im_h,im_w',im_h,im_w)
        target_sizes = torch.tensor([[im_w, im_h]])
        target_sizes = target_sizes.to(device)
        img_h, img_w = target_sizes.unbind(1)
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
        boxes = boxes * scale_fct[:, None, :]

        print(time.time() - t0)

        # plot_results
        source_img = Image.open(img_path).convert("RGBA")
        draw = ImageDraw.Draw(source_img)
        print("Boxes", boxes, boxes.tolist())
        for xmin, ymin, xmax, ymax in boxes[0].tolist():
            draw.rectangle(((xmin, ymin), (xmax, ymax)), outline="red")
        source_img.save(out_imgName, "png")
def main(args):
    """Single-image inference plus attention visualization (exploratory script).

    Runs the model on ``args.img_path``, plots softmax-thresholded detections,
    captures encoder/decoder attention via forward hooks and saves per-query
    attention maps, then redoes box extraction with sigmoid top-k scoring and
    saves a rectangle overlay to test.png.

    NOTE(review): paths are hard-coded Colab locations, the image is
    normalized and forwarded multiple times, and `img = img.cuda()` assumes a
    CUDA device — this reads as notebook-derived exploration code.
    """
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))
    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)
    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)
    checkpoint = torch.load(args.resume, map_location='cpu')
    model.load_state_dict(checkpoint['model'], strict=False)
    if torch.cuda.is_available():
        model.cuda()
    model.eval()

    t0 = time.time()
    im = Image.open(args.img_path)
    print(im)
    # mean-std normalize the input image (batch-size: 1)
    img = transform(im).unsqueeze(0)
    #print('img shape = ', img[0].shape)
    img = img.cuda()
    #print(img[0].cpu().numpy())
    # imported for the (currently commented-out) tensor dumps below
    import cv2
    # cv2.imwrite('/content/content/content/Deformable-DETR/explained /sample before forward/tensor0.png', img[0][0].cpu().numpy(), )
    # cv2.imwrite( '/content/content/content/Deformable-DETR/explained /sample before forward/tensor1.png', img[0][1].cpu().numpy(),)
    # cv2.imwrite( '/content/content/content/Deformable-DETR/explained /sample before forward/tensor2.png', img[0][2].cpu().numpy(),)
    # cv2.imwrite( '/content/content/content/Deformable-DETR/explained /sample before forward/tensor3.png', img[0].cpu().numpy(),)
    # propagate through the model
    outputs = model(img)
    print ('-----------------------------------------------------------------------------')

    # mean-std normalize the input image (batch-size: 1)
    # NOTE(review): the image is re-normalized and re-forwarded here — the
    # result should be identical to the forward above.
    img = transform(im).unsqueeze(0)
    img = img.cuda()
    # propagate through the model
    outputs = model(img)

    # keep only predictions with 0.7+ confidence
    # NOTE(review): the comment says 0.7 but the threshold used is 0.6.
    probas = outputs['pred_logits'].softmax(-1)[0, :, :-1]
    keep = probas.max(-1).values > 0.6
    # convert boxes from [0; 1] to image scales
    bboxes_scaled = V.rescale_bboxes(outputs['pred_boxes'][0, keep], im.size)
    V.plot_results(im, probas[keep], bboxes_scaled,
                   '/content/Explain-Deformable-DETR/explained /output/1.png')
    print ('keep.nonzero()', keep)

    # use lists to store the outputs via up-values
    conv_features, enc_attn_weights, dec_attn_weights = [], [], []
    hooks = [
        model.backbone[-2].register_forward_hook(
            lambda self, input, output: conv_features.append(output)
        ),
        model.transformer.encoder.layers[-1].self_attn.register_forward_hook(
            lambda self, input, output: enc_attn_weights.append(output[0])
        ),
        model.transformer.decoder.layers[-1].self_attn.register_forward_hook(
            lambda self, input, output: dec_attn_weights.append(output[0])
        ),
    ]
    # propagate through the model (third forward, this time with hooks armed)
    outputs = model(img)
    for hook in hooks:
        hook.remove()

    # don't need the list anymore
    conv_features = conv_features[0]
    enc_attn_weights = enc_attn_weights[0]
    dec_attn_weights = dec_attn_weights[0]

    h, w = conv_features['0'].tensors.shape[-2:]
    print ('(dec_attn_weights[0, idx].view(h, w))', (dec_attn_weights.transpose(0, 1).shape))
    dec_attn_weights = dec_attn_weights.transpose(0, 1).cpu()
    print ('bboxes_scaled', bboxes_scaled.shape)

    # One column per kept detection: attention map on top, box overlay below.
    fig, axs = plt.subplots(ncols=len(bboxes_scaled), nrows=2, figsize=(22, 7))
    colors = V.COLORS * 100
    for idx, ax_i, (xmin, ymin, xmax, ymax) in zip(keep.nonzero(), axs.T, bboxes_scaled):
        ax = ax_i[0]
        # NOTE(review): hard-coded 16x16 view instead of the (h, w) computed
        # above — assumes the attention map is exactly 256 elements; confirm.
        ax.imshow(dec_attn_weights[0, idx].view(16, 16))
        ax.axis('off')
        ax.set_title(f'query id: {idx.item()}')
        ax = ax_i[1]
        ax.imshow(im)
        ax.add_patch(plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
                                   fill=False, color='blue', linewidth=3))
        ax.axis('off')
        ax.set_title(V.CLASSES[probas[idx].argmax()])
    fig.tight_layout()
    fig.savefig('/content/Explain-Deformable-DETR/explained /encoder feature map/1.png')
    # print ('model.transformer.encoder.layers[-1].self_attn', model.transformer.encoder.layers[-1].self_attn)
    # print ('enc_attn_weights.shape', enc_attn_weights.shape)
    # model.transformer.encoder.layers[-1].self_attn.register_forward_hook(printnorm)
    # outputs = model(img)
    print ('-----------------------------------------------------------------------------')

    # Second box-extraction pass: sigmoid scores, top-100 (query, class) pairs.
    out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes']
    prob = out_logits.sigmoid()
    topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1)
    scores = topk_values
    # Recover query index and class label from the flattened index.
    topk_boxes = topk_indexes // out_logits.shape[2]
    labels = topk_indexes % out_logits.shape[2]
    boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)
    boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))

    # `keep` is rebound here; the softmax-based mask above is no longer used.
    keep = scores[0] > 0.35
    boxes = boxes[0, keep]
    labels = labels[0, keep]

    # and from relative [0, 1] to absolute [0, height] coordinates
    # PIL's .size is (width, height), so the names below are swapped, but the
    # final scale factor [w, h, w, h] comes out consistent.
    im_h, im_w = im.size
    #print('im_h,im_w',im_h,im_w)
    target_sizes = torch.tensor([[im_w, im_h]])
    target_sizes = target_sizes.cuda()
    img_h, img_w = target_sizes.unbind(1)
    scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
    boxes = boxes * scale_fct[:, None, :]

    print(time.time() - t0)
    #plot_results
    source_img = Image.open(args.img_path).convert("RGBA")
    draw = ImageDraw.Draw(source_img)
    print("Boxes", boxes, boxes.tolist())
    for xmin, ymin, xmax, ymax in boxes[0].tolist():
        draw.rectangle(((xmin, ymin), (xmax, ymax)), outline="red")
    source_img.save('test.png', "png")
    results = [{'scores': s, 'labels': l, 'boxes': b} for s, l, b in zip(scores, labels, boxes)]
output_dir = "outputs" device = "cuda" seed = 42 start_epoch = 0 eval = False num_workers = 2 # distributed distributed = False world_size = 1 dist_url = "env://" # %% args = Args() utils.init_distributed_mode(args) device = torch.device(args.device) # fix the seed for reproducibility seed = args.seed + utils.get_rank() torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) # %% PREPARE DATA INSTRE_PATH = "/datadrive/crr/datasets/instre" TRAIN_PATH = INSTRE_PATH + "/INSTRE-S-TRAIN" TEST_PATH = INSTRE_PATH + "/INSTRE-S-TEST" class_dirs_train = glob.glob(os.path.join(TRAIN_PATH, "*/"))
def main(args):
    """Train a DETR-style tracker and periodically evaluate it on a VOT benchmark.

    Runs the standard distributed training loop (seeded, optionally DDP-wrapped),
    checkpoints every epoch, and — on the main process — runs `benchmark_test` /
    `benchmark_eval` every `args.benchmark_test_step` epochs, saving extra
    inference-only checkpoints whenever EAO or accuracy/robustness improves.

    Args:
        args: parsed namespace; expected to carry nested `args.model` and
            `args.dataset` sub-configs plus the usual training hyperparameters.
    """
    utils.init_distributed_mode(args)
    print("args: {}".format(args))
    device = torch.device(args.device)

    # fix the seed for reproducibility (per-rank offset so workers differ)
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # special process to control whether freeze backbone:
    # lr_backbone <= 0 means the backbone is kept frozen inside build_model
    args.model.train_backbone = args.lr_backbone > 0
    model, criterion, postprocessors = build_model(args.model)
    model.to(device)

    # keep a handle to the bare module for state_dict save/load under DDP
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    # two parameter groups: backbone params get their own (usually lower) LR
    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   args.lr_drop,
                                                   gamma=args.lr_gamma)

    # datasets need the backbone stride to build targets at feature-map scale
    dataset_train = build_dataset(
        image_set='train',
        args=args.dataset,
        model_stride=model_without_ddp.backbone.stride)
    dataset_val = build_dataset(image_set='val',
                                args=args.dataset,
                                model_stride=model_without_ddp.backbone.stride)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        # only resume optimizer/scheduler state for full training checkpoints
        if 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    # configure the benchmark test/eval runs from their own default parsers
    benchmark_test_parser = benchmark_test.get_args_parser()
    benchmark_test_args = benchmark_test_parser.get_defaults()
    benchmark_test_args.tracker.model = args.model  # overwrite the parameters about network model
    benchmark_test_args.result_path = Path(
        os.path.join(args.output_dir, 'benchmark'))
    benchmark_test_args.dataset_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'benchmark')
    benchmark_eval_parser = benchmark_eval.get_args_parser()
    benchmark_eval_args = benchmark_eval_parser.get_defaults()
    benchmark_eval_args.tracker_path = benchmark_test_args.result_path

    # running bests used to decide when to save extra inference checkpoints;
    # robustness is better when LOWER, hence the initial 10
    best_eao = 0
    best_ar = [0, 10]  # accuracy & robustness

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        # training
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every args.model_save_step epochs
            if (epoch + 1) % args.lr_drop == 0 or (
                    epoch + 1) % args.model_save_step == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)
            # hack: only inference model (weights only, no optimizer state)
            utils.save_on_master({'model': model_without_ddp.state_dict()},
                                 output_dir /
                                 'checkpoint_only_inference.pth')

        # evalute on the validation split every epoch
        val_stats = evaluate(model, criterion, postprocessors,
                             data_loader_val, device, args.output_dir)

        log_stats = {
            'epoch': epoch,
            **{f'train_{k}': v for k, v in train_stats.items()},
            **{f'val_{k}': v for k, v in val_stats.items()},
            'n_parameters': n_parameters
        }
        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

        # evualute with benchmark (main process only; other ranks wait at the
        # barrier below)
        if utils.is_main_process():
            if (
                    epoch + 1
            ) % args.benchmark_test_step == 0 and epoch > args.benchmark_start_epoch:
                tracker = build_tracker(benchmark_test_args.tracker,
                                        model=model_without_ddp,
                                        postprocessors=postprocessors)
                benchmark_test_args.model_name = "epoch" + str(epoch)
                benchmark_start_time = time.time()
                benchmark_test.main(benchmark_test_args, tracker)
                benchmark_time = time.time() - benchmark_start_time
                benchmark_eval_args.model_name = "epoch" + str(epoch)
                benchmark_eval_args.tracker_prefix = "epoch" + str(epoch)
                eval_results = benchmark_eval.main(benchmark_eval_args)
                # NOTE(review): assumes benchmark_eval.main returns a dict with a
                # single tracker entry — confirm if multiple trackers are evaluated
                eval_result = list(eval_results.values())[0]
                if benchmark_test_args.dataset in ['VOT2018', 'VOT2019']:
                    if args.output_dir:
                        with (output_dir /
                              str("benchmark_" + benchmark_test_args.dataset +
                                  ".txt")).open("a") as f:
                            f.write("epoch: " + str(epoch) + ", best EAO: " +
                                    str(best_eao) + ", " +
                                    json.dumps(eval_result) + "\n")
                    # new best Expected Average Overlap -> save weights-only snapshot
                    if best_eao < eval_result['EAO']:
                        best_eao = eval_result['EAO']
                        if args.output_dir:
                            best_eao_int = int(best_eao * 1000)
                            # record: only inference model
                            utils.save_on_master(
                                {'model': model_without_ddp.state_dict()},
                                output_dir /
                                f'checkpoint{epoch:04}_best_eao_{best_eao_int:03}_only_inference.pth'
                            )
                    # better on BOTH accuracy (higher) and robustness (lower)
                    if best_ar[0] < eval_result['accuracy'] and best_ar[
                            1] > eval_result['robustness']:
                        best_ar[0] = eval_result['accuracy']
                        best_ar[1] = eval_result['robustness']
                        if args.output_dir:
                            best_accuracy_int = int(best_ar[0] * 1000)
                            best_robustness_int = int(best_ar[1] * 1000)
                            # record: only inference model
                            utils.save_on_master(
                                {'model': model_without_ddp.state_dict()},
                                output_dir /
                                f'checkpoint{epoch:04}_best_ar_{best_accuracy_int:03}_{best_robustness_int:03}_only_inference.pth'
                            )
                print("benchmark time: {}".format(benchmark_time))
        if args.distributed:
            # keep non-main ranks in step while rank 0 runs the benchmark
            torch.distributed.barrier()

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    """Run inference of a DETR-style model over the validation set and print
    per-image high-confidence predictions.

    Loads the model defined by ``args``, restores weights from ``args.resume``,
    then iterates the val loader printing class probabilities and rescaled boxes
    for predictions whose max class probability exceeds 0.7.

    Args:
        args: parsed namespace with model/dataset hyperparameters; must provide
            ``args.resume`` pointing at a checkpoint with a ``'model'`` entry.
    """
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    dataset_val = build_dataset(image_set='val', args=args)
    sampler_val = torch.utils.data.SequentialSampler(dataset_val)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)
    base_ds = get_coco_api_from_dataset(dataset_val)

    # BUGFIX: load on CPU first (matches the other entry points in this file);
    # without map_location a GPU-saved checkpoint fails on CPU-only hosts and
    # needlessly allocates on the saving device.
    checkpoint = torch.load(args.resume, map_location='cpu')
    model_without_ddp.load_state_dict(checkpoint['model'])
    model_without_ddp.eval()
    model_without_ddp.to(device)

    # BUGFIX: pure inference — disable autograd so no graph is built and
    # memory stays bounded across the loop.
    with torch.no_grad():
        for samples, targets in data_loader_val:
            samples = samples.to(device)
            targets = [{k: v.to(device)
                        for k, v in t.items()} for t in targets]
            outputs = model_without_ddp(samples)
            # drop the trailing "no object" class before thresholding
            probas = outputs['pred_logits'].softmax(-1)[0, :, :-1]
            keep = probas.max(-1).values > 0.7
            # NOTE(review): rescale_bboxes is called with boxes only — confirm
            # it does not also require the original image size here.
            bboxes_scaled = rescale_bboxes(outputs['pred_boxes'][0, keep])
            print(probas[keep], bboxes_scaled)
def main(args):
    """Train or evaluate a DETR-style detector.

    Supports distributed training, several dataset backends (COCO, panoptic,
    wider, cmdd/cmdc, MOT17), checkpoint resume with key-remapping
    ("Bruno's hack"), finetuning (classifier head skipped on load),
    eval-only and wider-test-only modes.

    Args:
        args: parsed namespace with all training/eval hyperparameters.
    """
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility (per-rank offset)
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    # bare module handle for state_dict I/O when DDP-wrapped
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print("number of params:", n_parameters)

    # separate LR for backbone parameters
    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set="train", args=args)
    dataset_val = build_dataset(image_set="val", args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(
        dataset_train,
        batch_sampler=batch_sampler_train,
        collate_fn=utils.collate_fn,
        num_workers=args.num_workers,
    )
    # validation batch size is capped at 4
    data_loader_val = DataLoader(
        dataset_val,
        args.batch_size if args.batch_size < 4 else 4,
        sampler=sampler_val,
        drop_last=False,
        collate_fn=utils.collate_fn,
        num_workers=args.num_workers,
    )

    # pick the COCO-API ground-truth wrapper appropriate for the dataset;
    # wider/cmdd/cmdc have no COCO-style evaluation (base_ds stays None)
    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    elif args.dataset_file in ["cmdd", "cmdc", "wider"]:
        base_ds = None
    elif args.dataset_file == "MOT17":
        base_ds = dataset_val
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location="cpu")
        model_without_ddp.detr.load_state_dict(checkpoint["model"])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith("https"):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location="cpu",
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location="cpu")
        # NOTE: this is Bruno's hack to load stuff in — merge the checkpoint
        # into the current state dict rather than loading it strictly
        model_dict = model_without_ddp.state_dict()
        pretrained_dict = checkpoint["model"]
        # hack for adding query stuff: remap the old query-embedding key to the
        # new nested name when both layouts are present
        if ("query_embed.query_embed.weight" in model_dict.keys()
                and "query_embed.weight" in pretrained_dict.keys()):
            pretrained_dict[
                "query_embed.query_embed.weight"] = pretrained_dict[
                    "query_embed.weight"]
        # 1. filter out unnecessary keys
        pretrained_dict = {
            k: v
            for k, v in pretrained_dict.items() if k in model_dict
        }
        # if finetuning skip the linear stuff (classification head is re-learned)
        if args.finetune:
            pretrained_dict = {
                k: v
                for k, v in pretrained_dict.items()
                if k not in ["class_embed.weight", "class_embed.bias"]
            }
        # 2. overwrite entries in the existing state dict
        model_dict.update(pretrained_dict)
        # 3. load new state dict
        model_without_ddp.load_state_dict(model_dict)

        # resume optimizer/scheduler only for a genuine continue-training run
        if (not args.eval and not args.load_model_only
                and "optimizer" in checkpoint and "lr_scheduler" in checkpoint
                and "epoch" in checkpoint):
            optimizer.load_state_dict(checkpoint["optimizer"])
            lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
            args.start_epoch = checkpoint["epoch"] + 1

    if args.eval:
        # wider test mode writes predictions next to the resumed checkpoint
        if args.test and args.dataset_file == "wider":
            if args.resume:
                s = args.resume.split("/")[:-1]
                output_dir = "/" + os.path.join(*s)
            else:
                output_dir = args.output_dir
            print("SAVING TEST WIDER TO ", output_dir)
            test_wider(
                model,
                criterion,
                postprocessors,
                dataset_val,
                data_loader_val,
                device,
                output_dir,
            )
            return
        test_stats, coco_evaluator = evaluate(
            model,
            criterion,
            postprocessors,
            data_loader_val,
            base_ds,
            device,
            args.output_dir,
            dset_file=args.dataset_file,
        )
        if args.output_dir and coco_evaluator is not None:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(
            model,
            criterion,
            data_loader_train,
            optimizer,
            device,
            epoch,
            args.clip_max_norm,
        )
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / "checkpoint.pth"]
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir /
                                        f"checkpoint{epoch:04}.pth")
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        "model": model_without_ddp.state_dict(),
                        "optimizer": optimizer.state_dict(),
                        "lr_scheduler": lr_scheduler.state_dict(),
                        "epoch": epoch,
                        "args": args,
                    },
                    checkpoint_path,
                )

        test_stats, coco_evaluator = evaluate(
            model,
            criterion,
            postprocessors,
            data_loader_val,
            base_ds,
            device,
            args.output_dir,
            dset_file=args.dataset_file,
        )

        log_stats = {
            **{f"train_{k}": v for k, v in train_stats.items()},
            **{f"test_{k}": v for k, v in test_stats.items()},
            "epoch": epoch,
            "n_parameters": n_parameters,
        }
        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")
            # for evaluation logs: keep latest COCO eval plus periodic snapshots
            if coco_evaluator is not None:
                (output_dir / "eval").mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ["latest.pth"]
                    if epoch % 50 == 0:
                        filenames.append(f"{epoch:03}.pth")
                    for name in filenames:
                        torch.save(
                            coco_evaluator.coco_eval["bbox"].eval,
                            output_dir / "eval" / name,
                        )
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print("Training time {}".format(total_time_str))
def main(args, exp_cfg):
    """Train an SMPLX body/hand/head regression network.

    Builds a concatenated multi-part dataset with per-sample weighted sampling,
    supports optional pretrained-weight loading (non-strict) and checkpoint
    resume with learning-rate preservation, then runs the epoch loop with
    periodic checkpointing and JSON logging.

    Args:
        args: runtime/distributed/training namespace (epochs, resume, etc.).
        exp_cfg: experiment config consumed by SMPLXNet, the dataset factory,
            and the optimizer/scheduler builders.
    """
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))
    print(args)

    # device = torch.device('cuda')
    device = torch.device(args.device)

    # fix the seed for reproducibility (per-rank offset)
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # model, criterion, postprocessors = build_model(args)
    model = SMPLXNet(exp_cfg)
    model.to(device)

    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    # for n, p in model_without_ddp.named_parameters():
    #     print(n)

    # dataset_train = build_dataset(image_set='train', args=args)
    # dataset_val = build_dataset(image_set='val', args=args)
    print('start build dataset')
    datasets = make_all_datasets(exp_cfg, split='train')
    # dataset_train = ConcatDataset(datasets['body'])
    dataset_train = ConcatDataset(datasets['body'] + datasets['hand'] +
                                  datasets['head'])
    print('finish build dataset')

    # concatenate each child dataset's per-sample weights so the weighted
    # sampler draws across all parts with the intended probabilities
    sample_weight = [
        child_dataset.sample_weight
        for child_dataset in dataset_train.datasets
    ]
    sample_weight = np.concatenate(sample_weight, axis=0)
    sampler_train = torch.utils.data.sampler.WeightedRandomSampler(
        sample_weight, len(dataset_train))
    # sampler_val = torch.utils.data.SequentialSampler(dataset_val)
    if args.distributed:
        # NOTE(review): wraps a *sampler* (not a dataset) — assumes the custom
        # samplers.DistributedSampler supports this; confirm against samplers.py
        sampler_train = samplers.DistributedSampler(sampler_train)
        # sampler_val = samplers.DistributedSampler(sampler_val, shuffle=False)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    collate_fn = functools.partial(collate_batch,
                                   use_shared_memory=args.num_workers > 0,
                                   return_full_imgs=True)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=collate_fn,
                                   num_workers=args.num_workers,
                                   pin_memory=True)
    # data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
    #                              drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers,
    #                              pin_memory=True)

    optim_cfg = exp_cfg.get('optim', {})
    optimizer = build_optimizer(model, optim_cfg)
    lr_scheduler = build_scheduler(optimizer, optim_cfg['scheduler'])

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    # pretrained weights: non-strict load, ignoring profiler bookkeeping keys
    if args.pretrain:
        checkpoint = torch.load(args.pretrain, map_location='cpu')
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
        # total_params / total_ops come from FLOPs-counting tools, not the model
        unexpected_keys = [
            k for k in unexpected_keys
            if not (k.endswith('total_params') or k.endswith('total_ops'))
        ]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
        unexpected_keys = [
            k for k in unexpected_keys
            if not (k.endswith('total_params') or k.endswith('total_ops'))
        ]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))
        if 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            import copy
            # restore optimizer state but keep the CURRENT run's learning
            # rates (the checkpoint's lr would otherwise overwrite them)
            p_groups = copy.deepcopy(optimizer.param_groups)
            optimizer.load_state_dict(checkpoint['optimizer'])
            for pg, pg_old in zip(optimizer.param_groups, p_groups):
                pg['lr'] = pg_old['lr']
                pg['initial_lr'] = pg_old['initial_lr']
            print(optimizer.param_groups)
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            lr_scheduler.step(lr_scheduler.last_epoch)
            args.start_epoch = checkpoint['epoch'] + 1

    print("Start training")
    output_dir = Path(args.output_dir)
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, data_loader_train, optimizer,
                                      device, epoch)
        # print('DEBUG!!!!!!!!!'); train_stats = {}
        lr_scheduler.step()
        if args.output_dir:
            if not os.path.exists(args.output_dir) and utils.is_main_process():
                os.makedirs(args.output_dir)
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 1 epochs
            if (epoch + 1) % args.save_freq == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        log_stats = {
            **{f'train_{k}': v for k, v in train_stats.items()},
            'epoch': epoch,
            'n_parameters': n_parameters
        }
        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    """Train or evaluate a transformer-based multi-object tracker.

    Builds one of three model variants (plain detector for det_val, track-test
    model for eval, track-train model otherwise), supports AMP (fp16), cached
    distributed sampling, three-way parameter-group LRs, checkpoint resume with
    an LR-drop override hack, and — in eval mode — saving MOT-format tracks.

    Args:
        args: parsed namespace with tracking/training hyperparameters.
    """
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility (per-rank offset)
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # gradient scaler is a no-op when fp16 is disabled
    scaler = torch.cuda.amp.GradScaler(enabled=args.fp16)

    # pick the model variant matching the run mode
    if args.det_val:
        assert args.eval, 'only support eval mode of detector for track'
        model, criterion, postprocessors = build_model(args)
    elif args.eval:
        model, criterion, postprocessors = build_tracktest_model(args)
    else:
        model, criterion, postprocessors = build_tracktrain_model(args)

    model.to(device)

    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    dataset_train = build_dataset(image_set=args.track_train_split, args=args)
    dataset_val = build_dataset(image_set=args.track_eval_split, args=args)

    if args.distributed:
        # cache_mode uses node-local sampling so each node reads its own shard
        if args.cache_mode:
            sampler_train = samplers.NodeDistributedSampler(dataset_train)
            sampler_val = samplers.NodeDistributedSampler(dataset_val,
                                                          shuffle=False)
        else:
            sampler_train = samplers.DistributedSampler(dataset_train)
            sampler_val = samplers.DistributedSampler(dataset_val,
                                                      shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers,
                                   pin_memory=True)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers,
                                 pin_memory=True)

    # lr_backbone_names = ["backbone.0", "backbone.neck", "input_proj", "transformer.encoder"]
    def match_name_keywords(n, name_keywords):
        # True if any keyword appears as a substring of parameter name n
        out = False
        for b in name_keywords:
            if b in n:
                out = True
                break
        return out

    for n, p in model_without_ddp.named_parameters():
        print(n)

    # three parameter groups: main params at args.lr, backbone params at
    # args.lr_backbone, linear-projection params at a scaled-down lr
    param_dicts = [{
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if not match_name_keywords(n, args.lr_backbone_names)
            and not match_name_keywords(n, args.lr_linear_proj_names)
            and p.requires_grad
        ],
        "lr":
        args.lr,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if match_name_keywords(n, args.lr_backbone_names)
            and p.requires_grad
        ],
        "lr":
        args.lr_backbone,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if match_name_keywords(n, args.lr_linear_proj_names)
            and p.requires_grad
        ],
        "lr":
        args.lr * args.lr_linear_proj_mult,
    }]
    if args.sgd:
        optimizer = torch.optim.SGD(param_dicts,
                                    lr=args.lr,
                                    momentum=0.9,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.AdamW(param_dicts,
                                      lr=args.lr,
                                      weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        # non-strict load; ignore FLOPs-counter bookkeeping keys
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
        unexpected_keys = [
            k for k in unexpected_keys
            if not (k.endswith('total_params') or k.endswith('total_ops'))
        ]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            import copy
            # keep the CURRENT run's learning rates when restoring optimizer state
            p_groups = copy.deepcopy(optimizer.param_groups)
            optimizer.load_state_dict(checkpoint['optimizer'])
            for pg, pg_old in zip(optimizer.param_groups, p_groups):
                pg['lr'] = pg_old['lr']
                pg['initial_lr'] = pg_old['initial_lr']
            print(optimizer.param_groups)
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            # todo: this is a hack for doing experiment that resume from checkpoint and also modify lr scheduler (e.g., decrease lr in advance).
            args.override_resumed_lr_drop = True
            if args.override_resumed_lr_drop:
                print(
                    'Warning: (hack) args.override_resumed_lr_drop is set to True, so args.lr_drop would override lr_drop in resumed lr_scheduler.'
                )
                lr_scheduler.step_size = args.lr_drop
                lr_scheduler.base_lrs = list(
                    map(lambda group: group['initial_lr'],
                        optimizer.param_groups))
            lr_scheduler.step(lr_scheduler.last_epoch)
            args.start_epoch = checkpoint['epoch'] + 1
        # check the resumed model
        # if not args.eval:
        #     test_stats, coco_evaluator, _ = evaluate(
        #         model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir
        #     )

    if args.eval:
        assert args.batch_size == 1, print("Now only support 1.")
        tracker = Tracker(score_thresh=args.track_thresh)
        test_stats, coco_evaluator, res_tracks = evaluate(
            model,
            criterion,
            postprocessors,
            data_loader_val,
            base_ds,
            device,
            args.output_dir,
            tracker=tracker,
            phase='eval',
            det_val=args.det_val,
            fp16=args.fp16)
        if args.output_dir:
            # utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth")
            if res_tracks is not None:
                print("Creating video index for {}.".format(
                    args.dataset_file))
                # map each video id to its frames so tracks can be grouped per video
                video_to_images = defaultdict(list)
                video_names = defaultdict()
                for _, info in dataset_val.coco.imgs.items():
                    video_to_images[info["video_id"]].append({
                        "image_id":
                        info["id"],
                        "frame_id":
                        info["frame_id"]
                    })
                    video_name = info["file_name"].split("/")[0]
                    if video_name not in video_names:
                        video_names[info["video_id"]] = video_name
                assert len(video_to_images) == len(video_names)
                # save mot results.
                save_track(res_tracks, args.output_dir, video_to_images,
                           video_names, args.track_eval_split)
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model,
                                      criterion,
                                      data_loader_train,
                                      optimizer,
                                      device,
                                      scaler,
                                      epoch,
                                      args.clip_max_norm,
                                      fp16=args.fp16)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 5 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 5 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        log_stats = {
            **{f'train_{k}': v for k, v in train_stats.items()},
            'epoch': epoch,
            'n_parameters': n_parameters
        }

        # validate every 10 epochs and for the last few epochs of the run
        if epoch % 10 == 0 or epoch > args.epochs - 5:
            test_stats, coco_evaluator, _ = evaluate(model,
                                                     criterion,
                                                     postprocessors,
                                                     data_loader_val,
                                                     base_ds,
                                                     device,
                                                     args.output_dir,
                                                     fp16=args.fp16)
            log_test_stats = {
                **{f'test_{k}': v for k, v in test_stats.items()}
            }
            log_stats.update(log_test_stats)

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            # if coco_evaluator is not None:
            #     (output_dir / 'eval').mkdir(exist_ok=True)
            #     if "bbox" in coco_evaluator.coco_eval:
            #         filenames = ['latest.pth']
            #         if epoch % 50 == 0:
            #             filenames.append(f'{epoch:03}.pth')
            #         for name in filenames:
            #             torch.save(coco_evaluator.coco_eval["bbox"].eval,
            #                        output_dir / "eval" / name)
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    """Pretrain a DETR-style model on ImageNet (UP-DETR-style pretext task).

    Forces dataset/mask settings into DETR's expected format, optionally
    freezes the CNN backbone via a zero backbone LR, then runs the standard
    distributed training loop with checkpoint resume and periodic saving.
    Training only — there is no validation loop here.

    Args:
        args: parsed namespace with pretraining hyperparameters; ``args.fre_cnn``
            controls backbone freezing.
    """
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    # align with DETR format
    args.dataset_file = 'ImageNet'
    args.masks = None
    # freeze cnn weights: lr_backbone == 0 disables backbone training downstream
    args.lr_backbone = 0 if args.fre_cnn else args.lr
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility (per-rank offset)
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    # separate LR for backbone parameters (zero when the CNN is frozen)
    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.updetr_collate_fn,
                                   num_workers=args.num_workers)
    # total number of optimizer steps for the whole run
    print(len(data_loader_train) * args.epochs)
    output_dir = Path(args.output_dir)
    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            # allow the current run's lr_drop to override the resumed schedule
            if lr_scheduler.step_size != args.lr_drop:
                lr_scheduler.step_size = args.lr_drop
            args.start_epoch = checkpoint['epoch'] + 1

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 20 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 20 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        log_stats = {
            **{f'train_{k}': v for k, v in train_stats.items()},
            'epoch': epoch,
            'n_parameters': n_parameters
        }
        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    """Train a MOT-style Deformable-DETR on the dataset named in ``args.data_cfg``.

    Reads the data-config JSON, builds dataset/model/optimizer, optionally loads
    pretrained weights (dropping incompatible classification heads), then runs
    the training loop with periodic checkpointing.
    """
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))
    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # Fix the seed (offset by rank) for reproducibility.
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    Dataset = get_dataset(args.dataset, args.task)
    # FIX: use a context manager instead of a bare open()/close() pair so the
    # config file is closed even if json.load raises.
    with open(args.data_cfg) as f:
        data_config = json.load(f)
    trainset_paths = data_config['train']
    dataset_root = data_config['root']

    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
    transforms = T.Compose([
        T.RandomHorizontalFlip(),
        T.RandomSelect(
            T.RandomResize(scales, max_size=1333),
            T.Compose([
                T.RandomResize([400, 500, 600]),
                T.RandomSizeCrop(384, 600),
                T.RandomResize(scales, max_size=1333),
            ])),
        normalize,
    ])
    dataset_train = Dataset(args, dataset_root, trainset_paths, (1088, 608),
                            augment=True, transforms=transforms)
    # The re-ID branch needs the number of identities present in the data.
    args.nID = dataset_train.nID

    model, criterion, postprocessors = build_model(args)
    model.to(device)
    model_without_ddp = model

    if args.distributed:
        if args.cache_mode:
            sampler_train = samplers.NodeDistributedSampler(dataset_train)
        else:
            sampler_train = samplers.DistributedSampler(dataset_train)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers,
                                   pin_memory=True)

    def match_name_keywords(n, name_keywords):
        # True when any keyword is a substring of the parameter name.
        return any(b in n for b in name_keywords)

    for n, p in model_without_ddp.named_parameters():
        print(n)

    # Three LR groups: default, backbone, and linear-projection layers.
    param_dicts = [
        {
            "params": [p for n, p in model_without_ddp.named_parameters()
                       if not match_name_keywords(n, args.lr_backbone_names)
                       and not match_name_keywords(n, args.lr_linear_proj_names)
                       and p.requires_grad],
            "lr": args.lr,
        },
        {
            "params": [p for n, p in model_without_ddp.named_parameters()
                       if match_name_keywords(n, args.lr_backbone_names)
                       and p.requires_grad],
            "lr": args.lr_backbone,
        },
        {
            "params": [p for n, p in model_without_ddp.named_parameters()
                       if match_name_keywords(n, args.lr_linear_proj_names)
                       and p.requires_grad],
            "lr": args.lr * args.lr_linear_proj_mult,
        },
    ]
    if args.sgd:
        optimizer = torch.optim.SGD(param_dicts, lr=args.lr, momentum=0.9,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                      weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_dict = model_without_ddp.state_dict()  # current model parameters
        # Drop the pretrained classification heads (class_embed.0 .. class_embed.5):
        # the class count differs, so those weights cannot be reused.
        excluded = {f"class_embed.{i}.{suffix}"
                    for i in range(6) for suffix in ("weight", "bias")}
        pretrained_dict = {k: v for k, v in checkpoint['model'].items()
                           if k not in excluded}
        model_dict.update(pretrained_dict)
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            model_dict, strict=False)
        unexpected_keys = [k for k in unexpected_keys
                           if not (k.endswith('total_params')
                                   or k.endswith('total_ops'))]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))
        # Only the epoch counter is resumed here; optimizer / scheduler state
        # is deliberately rebuilt from scratch.
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            args.start_epoch = checkpoint['epoch'] + 1

    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(args, model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 5 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 5 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    """Load a trained DETR checkpoint and visualize predictions on the nvdata test set.

    Builds the model, loads ``<args.resume>/checkpoint.pth``, and (when
    ``args.eval``) runs box visualization. Returns the model, dataset,
    postprocessors and device for interactive use.
    """
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))
    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # NOTE(review): hard-coded user path + sudo chmod 777 is machine-specific
    # and a security smell; kept for behavior, but this should be removed or
    # made configurable. (Presumably needed for the torch.hub cache dir.)
    os.system("sudo chmod -R 777 /home/shuxuang/.cache/")

    model, criterion, postprocessors = build_model(
        args)  # use the same model as detr paper on coco
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    dataset_val = build_nvdataset(
        dataset_root=[
            os.path.join(os.environ["HOME"], 'datasets/detection-f'),  # test annotations
            os.path.join(os.environ["HOME"], 'datasets/frames_nvidia')  # frames
        ],
        mode='test',
        camera=args.camera)
    print("Validation samples: %d" % (len(dataset_val)))
    # FIX: removed a leftover bare IPython.embed() here -- it dropped every
    # run into an interactive shell, blocking non-interactive execution.

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)

    # args.resume points at a run directory (log.txt lives alongside the
    # checkpoint); resolve it to the checkpoint file itself.
    args.resume = os.path.join(args.resume, 'checkpoint.pth')
    print(args.resume)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            print('Loading model: %s' % args.resume)
            checkpoint = torch.load(args.resume, map_location='cpu')
            print('Load model from %d epoch' % checkpoint['epoch'])
        model_without_ddp.load_state_dict(checkpoint['model'])

    if args.eval:
        vis_bboxes(model, dataset_val, postprocessors, device)
    # NOTE(review): in the collapsed original the indentation of this return
    # is ambiguous; it is kept at function level so callers always get the
    # tuple back -- confirm against the original script.
    return model, dataset_val, postprocessors, device
def main(args):
    """Train/evaluate Deformable-DETR with separate train, val and test splits.

    Supports --eval (val split), --test (test split) and full training with
    per-epoch COCO evaluation and checkpointing.
    """
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))
    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)
    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)
    dataset_test = build_dataset(image_set='test', args=args)

    if args.distributed:
        if args.cache_mode:
            sampler_train = samplers.NodeDistributedSampler(dataset_train)
            sampler_val = samplers.NodeDistributedSampler(dataset_val, shuffle=False)
            # BUGFIX: sampler_test was never defined on this branch, which
            # raised NameError for --test runs under --cache_mode.
            sampler_test = samplers.NodeDistributedSampler(dataset_test, shuffle=False)
        else:
            sampler_train = samplers.DistributedSampler(dataset_train)
            sampler_val = samplers.DistributedSampler(dataset_val, shuffle=False)
            sampler_test = samplers.DistributedSampler(dataset_test, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)
        sampler_test = torch.utils.data.SequentialSampler(dataset_test)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers,
                                   pin_memory=True)
    data_loader_val = DataLoader(dataset_val, args.batch_size,
                                 sampler=sampler_val, drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers,
                                 pin_memory=True)
    # BUGFIX: the test loader previously used sampler_val (indices over the
    # val set); it must iterate dataset_test with its own sampler.
    data_loader_test = DataLoader(dataset_test, args.batch_size,
                                  sampler=sampler_test, drop_last=False,
                                  collate_fn=utils.collate_fn,
                                  num_workers=args.num_workers,
                                  pin_memory=True)

    def match_name_keywords(n, name_keywords):
        # True when any keyword is a substring of the parameter name.
        return any(b in n for b in name_keywords)

    for n, p in model_without_ddp.named_parameters():
        print(n)

    # Three LR groups: default, backbone, and linear-projection layers.
    param_dicts = [
        {
            "params": [p for n, p in model_without_ddp.named_parameters()
                       if not match_name_keywords(n, args.lr_backbone_names)
                       and not match_name_keywords(n, args.lr_linear_proj_names)
                       and p.requires_grad],
            "lr": args.lr,
        },
        {
            "params": [p for n, p in model_without_ddp.named_parameters()
                       if match_name_keywords(n, args.lr_backbone_names)
                       and p.requires_grad],
            "lr": args.lr_backbone,
        },
        {
            "params": [p for n, p in model_without_ddp.named_parameters()
                       if match_name_keywords(n, args.lr_linear_proj_names)
                       and p.requires_grad],
            "lr": args.lr * args.lr_linear_proj_mult,
        },
    ]
    if args.sgd:
        optimizer = torch.optim.SGD(param_dicts, lr=args.lr, momentum=0.9,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                      weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        # Drop every classification head (decoder heads 0-6 and the top-level
        # heads 0-6): the class count differs from the pretrained checkpoint.
        for i in range(7):
            for suffix in ("weight", "bias"):
                del checkpoint["model"][f"transformer.decoder.class_embed.{i}.{suffix}"]
                del checkpoint["model"][f"class_embed.{i}.{suffix}"]
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
        unexpected_keys = [k for k in unexpected_keys
                           if not (k.endswith('total_params')
                                   or k.endswith('total_ops'))]
        # sanity-check the resumed model on the val split before training
        if not args.eval:
            test_stats, coco_evaluator = evaluate(model, criterion,
                                                  postprocessors,
                                                  data_loader_val, base_ds,
                                                  device, args.output_dir)

    if args.eval:
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    if args.test:
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_test, base_ds, device,
                                              args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 5 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 5 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     **{f'test_{k}': v for k, v in test_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")
            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    """DETR training entry point with periodic evaluation.

    Supports COCO-style datasets (2-D evaluation) and 'anet' (3-D evaluation
    via ``evaluate3d``); evaluation runs every ``args.eval_freq`` epochs.
    """
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))
    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # Per-rank seed for reproducibility.
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    # Backbone parameters train at their own learning rate; everything else
    # uses the optimizer default.
    param_dicts = [
        {
            "params": [param for name, param in model_without_ddp.named_parameters()
                       if "backbone" not in name and param.requires_grad],
        },
        {
            "params": [param for name, param in model_without_ddp.named_parameters()
                       if "backbone" in name and param.requires_grad],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(split='train', args=args)
    dataset_val = build_dataset(split='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val, args.batch_size,
                                 sampler=sampler_val, drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    # base_ds is only needed by the COCO evaluator; 'anet' skips it.
    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    elif args.dataset_file == "coco":
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        resumable = (not args.eval
                     and 'optimizer' in checkpoint
                     and 'lr_scheduler' in checkpoint
                     and 'epoch' in checkpoint)
        if resumable:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()

        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            state = {
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'epoch': epoch,
                'args': args,
            }
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(state, checkpoint_path)

        if epoch % args.eval_freq == 0:
            if 'coco' in args.dataset_file:
                test_stats, coco_evaluator = evaluate(model, criterion,
                                                      postprocessors,
                                                      data_loader_val, base_ds,
                                                      device, args.output_dir)
            elif 'anet' == args.dataset_file:
                evaluate3d(model, postprocessors, data_loader_val, device, epoch)
def main(args):
    """Evaluate a trained DETR checkpoint on the local nvdata test split.

    Loads the checkpoint given by ``args.resume``, runs evaluation when
    ``args.eval`` is set, and returns the model, dataset, postprocessors and
    device for interactive use.
    """
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))
    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # Per-rank seed for reproducibility.
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # NOTE(review): machine-specific sudo chmod of a hard-coded user cache dir;
    # kept verbatim for behavior, but it should be removed or made configurable.
    os.system("sudo chmod -R 777 /home/shuxuang/.cache/")

    model, criterion, postprocessors = build_model(
        args)  # use the same model as detr paper on coco
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    home = os.environ["HOME"]
    dataset_val = build_nvdataset(
        dataset_root=[
            os.path.join(home, 'datasets/test'),
            os.path.join(home, 'datasets/test'),
        ],
        mode='test',
        camera=args.camera)
    print("Validation samples: %d" % (len(dataset_val)))

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)

    print(args.resume)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            print('Loading model: %s' % args.resume)
            checkpoint = torch.load(args.resume, map_location='cpu')
            print('Load model from %d epoch' % (checkpoint['epoch'] + 1))
        model_without_ddp.load_state_dict(checkpoint['model'])

    if args.eval:
        # nvdata uses the standard evaluator; other datasets use the
        # 5-class variant.
        if args.dataset_file == 'nvdata':
            evaluate(model, dataset_val, postprocessors, device)
        else:
            evaluate_5classes(model, dataset_val, postprocessors, device)
    # NOTE(review): return indentation is ambiguous in the collapsed original;
    # kept at function level so callers always receive the tuple -- confirm.
    return model, dataset_val, postprocessors, device
def main(args):
    """DETR train/eval entry point with wandb logging via the project ``io`` helpers.

    Checkpoint loading/saving and wandb session management are delegated to
    ``io``; evaluation runs on the val split after every training epoch.
    """
    utils.init_distributed_mode(args)
    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # Per-rank seed for reproducibility.
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    # Backbone parameters get their own learning rate; the rest use the default.
    param_dicts = [
        {
            "params": [param for name, param in model_without_ddp.named_parameters()
                       if "backbone" not in name and param.requires_grad],
        },
        {
            "params": [param for name, param in model_without_ddp.named_parameters()
                       if "backbone" in name and param.requires_grad],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    # Validation runs one image at a time.
    data_loader_val = DataLoader(dataset_val, batch_size=1,
                                 sampler=sampler_val, drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        io.load_frozen(args, model_without_ddp)

    output_dir = Path(args.output_dir)
    if args.resume:
        io.resume(args, model_without_ddp, optimizer, lr_scheduler)
    elif args.finetune:
        io.finetune(args, model_without_ddp)

    if args.eval:
        if args.output_dir and utils.is_main_process():
            io.init_wandb(args.dataset_file + "-detr-eval", model, args,
                          n_parameters=n_parameters)
        test_stats, evaluator = evaluate(model, criterion, postprocessors,
                                         data_loader_val, base_ds, device,
                                         args.output_dir)
        if args.output_dir:
            io.save_on_master(evaluator.coco_eval["bbox"].eval,
                              output_dir / "eval.pth")
        return

    print("Start training")
    start_time = time.time()
    if args.output_dir and utils.is_main_process():
        io.init_wandb(args.dataset_file + "-detr", model, args,
                      n_parameters=n_parameters)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()

        if args.output_dir:
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                io.save_checkpoint(args, model_without_ddp, optimizer,
                                   lr_scheduler, epoch)

        test_stats, evaluator = evaluate(model, criterion, postprocessors,
                                         data_loader_val, base_ds, device,
                                         args.output_dir, epoch)
        if utils.is_main_process() and args.output_dir:
            io.log_wandb(train_stats, test_stats)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))

    # save final model
    if utils.is_main_process() and args.output_dir:
        io.save_on_master(model_without_ddp, output_dir / "model_final.pth")
    print('Training time {}'.format(total_time_str))
def main(args):
    """Train or evaluate the GQA pipeline model.

    Builds train/val datasets (vocab comes from the dataset, so datasets must
    be constructed before the model), wraps the model in DDP when distributed,
    and runs the train/validate/checkpoint loop. With ``args.evaluate`` set it
    only runs validation with result dumping and returns.
    """
    # args = parser.parse_args()
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))
    print(args)

    if args.seed is not None:
        # random.seed(args.seed)
        # torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')
        # fix the seed for reproducibility (offset by rank so workers differ)
        seed = args.seed + utils.get_rank()
        torch.manual_seed(seed)
        np.random.seed(seed)
        random.seed(seed)

    ##################################
    # Logging setting
    ##################################
    if args.output_dir and utils.is_main_process():
        logging.basicConfig(
            filename=os.path.join(args.output_dir, args.log_name),
            filemode='w',
            format=
            '%(asctime)s: %(levelname)s: [%(filename)s:%(lineno)d]: %(message)s',
            level=logging.INFO)
    warnings.filterwarnings("ignore")

    ##################################
    # Save to logging
    ##################################
    if utils.is_main_process():
        logging.info(str(args))

    ##################################
    # Initialize dataset
    ##################################
    if not args.evaluate:
        # build_vocab_flag=True,  # Takes a long time to build a vocab
        train_dataset = GQATorchDataset(split='train_unbiased',
                                        build_vocab_flag=False,
                                        load_vocab_flag=False)
        if args.distributed:
            sampler_train = torch.utils.data.DistributedSampler(train_dataset)
        else:
            sampler_train = torch.utils.data.RandomSampler(train_dataset)
        batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                            args.batch_size,
                                                            drop_last=True)
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_sampler=batch_sampler_train,
            collate_fn=GQATorchDataset_collate_fn,
            num_workers=args.workers)
        # Old version
        # train_loader = torch.utils.data.DataLoader(
        #     train_dataset, batch_size=args.batch_size, shuffle=True,
        #     collate_fn=GQATorchDataset_collate_fn,
        #     num_workers=args.workers, pin_memory=True)

    # Validation can span several splits; concatenate them into one dataset.
    val_dataset_list = []
    for eval_split in args.evaluate_sets:
        val_dataset_list.append(
            GQATorchDataset(split=eval_split,
                            build_vocab_flag=False,
                            load_vocab_flag=args.evaluate))
    val_dataset = torch.utils.data.ConcatDataset(val_dataset_list)
    if args.distributed:
        sampler_val = torch.utils.data.DistributedSampler(val_dataset,
                                                          shuffle=False)
    else:
        sampler_val = torch.utils.data.SequentialSampler(val_dataset)
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=args.batch_size,
        sampler=sampler_val,
        drop_last=False,
        collate_fn=GQATorchDataset_collate_fn,
        num_workers=args.workers)
    # Old version
    # val_loader = torch.utils.data.DataLoader(
    #     val_dataset,
    #     batch_size=args.batch_size, shuffle=False,
    #     collate_fn=GQATorchDataset_collate_fn,
    #     num_workers=args.workers, pin_memory=True)

    ##################################
    # Initialize model
    # - note: must init dataset first. Since we will use the vocab from the dataset
    ##################################
    model = PipelineModel()

    ##################################
    # Deploy model on GPU
    ##################################
    model = model.to(device=cuda)
    # Keep a handle on the raw module so checkpoints are not DDP-prefixed.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    ##################################
    # define optimizer (and scheduler)
    ##################################
    # optimizer = torch.optim.SGD(model.parameters(), args.lr,
    #                             momentum=args.momentum,
    #                             weight_decay=args.weight_decay)
    optimizer = torch.optim.Adam(
        params=model.parameters(),
        lr=args.lr,
        betas=(0.9, 0.999),
        eps=1e-08,
        weight_decay=0,  # weight_decay=args.weight_decay
        amsgrad=False,
    )
    # optimizer = torch.optim.AdamW(
    #     params=model.parameters(),
    #     lr=args.lr,
    #     weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            model_without_ddp.load_state_dict(checkpoint['model'])
            # Optimizer/scheduler/epoch state only matters when training on.
            if not args.evaluate:
                if 'optimizer' in checkpoint:
                    optimizer.load_state_dict(checkpoint['optimizer'])
                if 'lr_scheduler' in checkpoint:
                    lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
                if 'epoch' in checkpoint:
                    args.start_epoch = checkpoint['epoch'] + 1
            # checkpoint = torch.load(args.resume)
            # args.start_epoch = checkpoint['epoch']
            # model.load_state_dict(checkpoint['state_dict'])
            # optimizer.load_state_dict(checkpoint['optimizer'])
            # print("=> loaded checkpoint '{}' (epoch {})"
            #       .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # cudnn.benchmark = True

    ##################################
    # Define loss functions (criterion)
    ##################################
    # criterion = torch.nn.CrossEntropyLoss().cuda()
    # Pad tokens are excluded from the sequence losses via ignore_index.
    text_pad_idx = GQATorchDataset.TEXT.vocab.stoi[
        GQATorchDataset.TEXT.pad_token]
    criterion = {
        "program":
        torch.nn.CrossEntropyLoss(ignore_index=text_pad_idx).to(device=cuda),
        "full_answer":
        torch.nn.CrossEntropyLoss(ignore_index=text_pad_idx).to(device=cuda),
        "short_answer":
        torch.nn.CrossEntropyLoss().to(device=cuda),
        # "short_answer": torch.nn.BCEWithLogitsLoss().to(device=cuda),  # sigmoid
        "execution_bitmap":
        torch.nn.BCELoss().to(device=cuda),
    }

    ##################################
    # If Evaluate Only
    ##################################
    if args.evaluate:
        validate(val_loader, model, criterion, args, DUMP_RESULT=True)
        return

    ##################################
    # Main Training Loop
    ##################################
    # best_acc1 = 0
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            ##################################
            # In distributed mode, calling the :meth`set_epoch(epoch) <set_epoch>` method
            # at the beginning of each epoch before creating the DataLoader iterator is necessary
            # to make shuffling work properly across multiple epochs.
            # Otherwise, the same ordering will be always used.
            ##################################
            sampler_train.set_epoch(epoch)
        # NOTE(review): the scheduler is stepped at the *start* of each epoch,
        # before training — PyTorch convention is to step after the epoch's
        # optimizer updates. Confirm this ordering is intended.
        lr_scheduler.step()
        # adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set every 5 epochs
        if (epoch + 1) % 5 == 0:
            validate(val_loader, model, criterion, args,
                     FAST_VALIDATE_FLAG=False)

        # # remember best acc@1 and save checkpoint
        # save_checkpoint({
        #     'epoch': epoch + 1,
        #     # 'arch': args.arch,
        #     'state_dict': model.state_dict(),
        #     # 'best_acc1': best_acc1,
        #     'optimizer' : optimizer.state_dict(),
        # }, is_best)

        if args.output_dir:
            output_dir = pathlib.Path(args.output_dir)
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)
# Tags for the 12 standard COCO summary metrics, in the order the evaluator
# reports them (6x AP then 6x AR). Shared by the bbox and mask logging below.
_COCO_SCALAR_TAGS = [
    'Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ]',
    'Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ]',
    'Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ]',
    'Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ]',
    'Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ]',
    'Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ]',
    'Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ]',
    'Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ]',
    'Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ]',
    'Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ]',
    'Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ]',
    'Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ]',
]


def _log_coco_scalars(writer, prefix, values, epoch):
    """Log the 12 COCO metrics in `values` to TensorBoard as '<prefix> <tag>'."""
    for tag, value in zip(_COCO_SCALAR_TAGS, values):
        writer.add_scalar(f'{prefix} {tag}', value, epoch)


def main(args):
    """Train or evaluate DETR, mirroring stats to TensorBoard.

    Tracks the best bbox/mask AP seen so far and saves it as ``best.pth``
    alongside the regular per-epoch checkpoints. With ``args.eval`` set it
    runs a single evaluation pass and returns.
    """
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    # Frozen-weight training only makes sense for the segmentation head.
    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility (offset by rank so workers differ)
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    # Keep a handle on the raw module so state_dict keys are not DDP-prefixed.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    # Two parameter groups: the backbone gets its own learning rate.
    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val, args.batch_size,
                                 sampler=sampler_val, drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        # strict=False tolerates head-shape mismatches when fine-tuning.
        model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    if args.eval:
        # Evaluation-only mode: one pass over val, dump COCO eval, and exit.
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    #cab
    writer = SummaryWriter("runs/" + args.tb_name)
    best_value = 0

    print("Start training, best_value is " + str(best_value))
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            # Required so DistributedSampler reshuffles differently each epoch.
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()

        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)

        #cab
        for k, v in train_stats.items():
            if isinstance(v, float):
                writer.add_scalar(f'train_{k}', v, epoch)
        new_value = 0
        for k, v in test_stats.items():
            if isinstance(v, float):
                writer.add_scalar(f'test_{k}', v, epoch)
            if k == "coco_eval_bbox":
                # AP@[.50:.95] (first entry) drives best-checkpoint selection.
                new_value = v[0]
                _log_coco_scalars(writer, 'Bbox', v, epoch)
            if k == "coco_eval_masks":
                new_value = v[0]
                _log_coco_scalars(writer, 'Mask', v, epoch)

        print("Epoch finished, best_value is " + str(best_value))
        save_pth = False
        if best_value < new_value:
            best_value = new_value
            save_pth = True

        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            if save_pth:
                checkpoint_paths.append(output_dir / 'best.pth')
                # Context manager fixes the original's leaked file handle.
                with open(output_dir / 'best_log.txt', 'w+') as best_log:
                    best_log.write(f'Saved model at epoch {epoch:04}\n')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)
        #/cab

        log_stats = {
            **{f'train_{k}': v for k, v in train_stats.items()},
            **{f'test_{k}': v for k, v in test_stats.items()},
            'epoch': epoch,
            'n_parameters': n_parameters
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    # Flush and release the TensorBoard event file.
    writer.close()

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    """Train or evaluate DETR with per-(backbone, transformer) output dirs.

    Checkpoints go to ``<output_dir>/<backbone>_<transformer_type>/``.
    With ``args.eval`` set, runs one evaluation pass and returns.
    """
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    # Frozen-weight training only makes sense for the segmentation head.
    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility (offset by rank so workers differ)
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    # Keep a handle on the raw module so state_dict keys are not DDP-prefixed.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    # Two parameter groups: the backbone gets its own learning rate.
    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                  weight_decay=args.weight_decay)
    # gamma=0.9: soft decay every lr_drop epochs instead of the default 10x.
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop,
                                                   gamma=0.9)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val, args.batch_size,
                                 sampler=sampler_val, drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    # Each backbone/transformer combination gets its own output subdirectory.
    output_dir = output_dir / f"{args.backbone}_{args.transformer_type}"
    if args.output_dir:
        output_dir.mkdir(parents=True, exist_ok=True)

    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    if args.eval:
        # Evaluation-only mode: one pass over val, dump COCO eval, and exit.
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        # FIX: the original fell through into the training loop here, so an
        # --eval run would evaluate and then start training. Sibling drivers
        # in this file return after the eval branch; do the same.
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            # Required so DistributedSampler reshuffles differently each epoch.
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            # One checkpoint per epoch, plus an extra copy before LR drop and
            # every 100 epochs.
            checkpoint_paths = [output_dir / f'checkpoint_{epoch}.pth']
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch}_extra.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)

        log_stats = {
            **{f'train_{k}': v for k, v in train_stats.items()},
            **{f'test_{k}': v for k, v in test_stats.items()},
            'epoch': epoch,
            'n_parameters': n_parameters
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    """Evaluate a checkpointed model on LVIS v1 minival, category by category.

    Category names are pre-encoded with the model's text encoder in chunks of
    32; each image is then scored against every chunk (batch size forced to 1
    per image) and the per-chunk detections are merged before LVIS evaluation.
    """
    utils.init_distributed_mode(args)
    device = torch.device(args.device)
    # One image per batch: the per-category replication below assumes it.
    args.batch_size = 1
    print(args)
    if args.dataset_config is not None:
        # https://stackoverflow.com/a/16878364
        # Overlay the JSON dataset config onto the argparse namespace in place.
        d = vars(args)
        with open(args.dataset_config, "r") as f:
            cfg = json.load(f)
        d.update(cfg)

    # fix the seed for reproducibility (offset by rank so workers differ)
    seed = args.seed + dist.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    if args.resume.startswith("https"):
        checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                        map_location="cpu",
                                                        check_hash=True)
    else:
        checkpoint = torch.load(args.resume, map_location="cpu")
    # Rebuild the model from the args stored in the checkpoint, filling in
    # any options that did not exist when the checkpoint was created.
    model_args = checkpoint["args"]
    for a in vars(args):
        if a not in vars(model_args):
            vars(model_args)[a] = vars(args)[a]
    model_args.device = args.device
    model, _, _, _, _ = build_model(model_args)
    model.to(device)

    with open(Path(args.lvis_minival_path) / "lvis_v1_minival.json", "r") as f:
        lvis_val = json.load(f)
    # Map LVIS category id -> category record; all_cats is the sorted id list.
    id2cat = {c["id"]: c for c in lvis_val["categories"]}
    all_cats = sorted(list(id2cat.keys()))

    # model_without_ddp = model
    # if args.distributed:
    #     model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True)
    #     model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("number of params:", n_parameters)

    dset = build_dataset("lvis", image_set="minival", args=args)
    sampler = DistributedSampler(dset, shuffle=False) if args.distributed else torch.utils.data.SequentialSampler(dset)
    dataloader = DataLoader(
        dset,
        args.batch_size,
        sampler=sampler,
        drop_last=False,
        collate_fn=partial(utils.collate_fn, False),
        num_workers=args.num_workers,
    )

    # --test dumps raw predictions; otherwise compute fixed AP directly.
    if args.test:
        evaluator = LvisDumper(fixed_ap=True,
                               out_path=os.path.join(args.output_dir, "lvis_eval"))
    else:
        evaluator = LvisEvaluatorFixedAP(dset.lvis, fixed_ap=True)

    postprocessor = build_postprocessors(args, "lvis")

    # EMA weights are evaluated; strict=False tolerates missing/extra keys.
    model.load_state_dict(checkpoint["model_ema"], strict=False)
    model.eval()

    label_set = torch.as_tensor(list(all_cats))
    print("label_set", len(label_set))
    text_memories = []
    print("encoding text...")
    # Pre-encode the category names once, 32 categories per chunk.
    splits = torch.split(label_set, 32)
    for split in tqdm(splits):
        captions = [f"{clean_name(id2cat[l]['name'])}" for l in split.tolist()]
        tokenized = model.transformer.tokenizer.batch_encode_plus(
            captions, padding="longest", return_tensors="pt").to(device)
        encoded_text = model.transformer.text_encoder(**tokenized)
        # Transpose memory because pytorch's attention expects sequence first
        text_memory = encoded_text.last_hidden_state.transpose(0, 1)
        # Invert attention mask that we get from huggingface because its the opposite in pytorch transformer
        text_attention_mask = tokenized.attention_mask.ne(1).bool()
        # Resize the encoder hidden states to be of the same d_model as the decoder
        text_memory_resized = model.transformer.resizer(text_memory)
        text_memories.append((text_attention_mask, text_memory_resized, tokenized))

    for batch_dict in tqdm(dataloader):
        samples = batch_dict["samples"].to(device)
        targets = batch_dict["targets"]
        # Move target tensors to the device; keep token bookkeeping on CPU.
        targets = [
            {
                k: v.to(device) if k not in ["tokens_positive", "tokens", "dataset_name"] else v
                for k, v in t.items() if k != "caption"
            } for t in targets
        ]
        assert len(targets) == 1
        t = targets[0]
        with torch.no_grad():
            # Encode the image once, then reuse its features for every chunk.
            features, orig_pos = model.backbone(samples)
            orig_src, orig_mask = features[-1].decompose()
            res = []
            for i in range(len(text_memories)):
                # captions = [f"{clean_name(id2cat[l]['name'])}" for l in split.tolist()]
                # Replicate the single image once per category in this chunk.
                bs = len(splits[i])
                src = orig_src.repeat(bs, 1, 1, 1)
                # NOTE(review): a NestedTensor mask is usually (B, H, W);
                # repeat with four factors inserts an extra dim — confirm the
                # transformer expects this shape.
                mask = orig_mask.repeat(bs, 1, 1, 1)
                pos = deepcopy(orig_pos)
                pos[0] = pos[0].repeat(bs, 1, 1, 1)
                memory_cache = model.transformer(
                    model.input_proj(src),
                    mask,
                    model.query_embed.weight,
                    pos[-1],
                    text_memories[i],
                    # captions,
                    encode_and_save=True,
                    text_memory=None,
                    img_memory=None,
                    text_attention_mask=None,
                )
                # NOTE(review): `captions` is left over from the text-encoding
                # loop above (i.e. the LAST chunk's names, not chunk i). The
                # commented-out line suggests per-chunk captions were intended;
                # confirm the decode path ignores `captions` when memory_cache
                # is supplied.
                out = model(samples, captions, encode_and_save=False,
                            memory_cache=memory_cache)
                orig_target_sizes = torch.stack(
                    [t["orig_size"] for _ in range(bs)], dim=0)
                results = postprocessor["bbox"](out, orig_target_sizes)
                assert len(results) == len(splits[i])
                # Rescale each row's labels by its true LVIS category id
                # (presumably the raw labels are all ones — confirm).
                for j in range(len(results)):
                    results[j]["labels"] *= splits[i][j].item()
                for output in results:
                    res.append((t["image_id"].item(), output))
        evaluator.update(res)

    evaluator.synchronize_between_processes()
    evaluator.summarize()