# --- Text-generation training script (top-level fragment) ---
# NOTE(review): relies on names defined elsewhere in the file (TextGenerator,
# nn, vocab_size, train_loader, val_loader, char2idx, idx2char, args, device,
# train_one_epoch, test_model, generate_text) — confirm they are in scope.
model = TextGenerator(embedding_dim=args.emb_dim, hidden_dim=args.hid_dim,
                      vocab_size=vocab_size)
model = model.to(device)
criterion = nn.NLLLoss()  # NLLLoss: the model presumably emits log-probabilities — TODO confirm
optimizer = torch.optim.Adam(model.parameters())

# One TensorBoard run directory per launch, keyed by timestamp.
timestr = time.strftime("%Y%m%d_%H%M%S")
writer = SummaryWriter(os.path.join(args.path_to_logdir, timestr))

best_acc = 0.
for epoch in range(0, args.n_epochs):
    print("Epoch %d" % epoch)
    train_one_epoch(model, train_loader, optimizer, criterion, device, writer,
                    epoch)
    acc = test_model(model, val_loader, criterion, writer, device, epoch)
    # Keep only the best checkpoint; note this saves the whole model object,
    # not just its state_dict.
    if acc > best_acc:
        best_acc = acc
        torch.save(model, os.path.join(args.path_to_model, "text_gen_best.pth"))
writer.close()

# Generate a sample from the trained model seeded with args.input_text.
text_generated = generate_text(model, args.input_text, char2idx, idx2char,
                               device, num_generate=args.n_generate)
def main():
    """Train and evaluate a Mask R-CNN person segmenter on PennFudan,
    then visualize one test prediction."""
    device = configs.device

    # Number of classes: background and person.
    num_classes = 2

    # Datasets: one view with training augmentations, one with eval transforms.
    dataset = PennFudanDataset('../data/PennFudanPed/', get_transform(train=True))
    dataset_test = PennFudanDataset('../data/PennFudanPed/', get_transform(train=False))

    # Split the dataset in train and test set (last 50 shuffled samples held out).
    indices = torch.randperm(len(dataset)).tolist()
    dataset = torch.utils.data.Subset(dataset, indices[:-50])
    dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])

    # Define training and validation data loaders.
    data_loader = torch.utils.data.DataLoader(dataset, batch_size=2,
                                              shuffle=True, num_workers=1,
                                              collate_fn=utils.collate_fn)
    data_loader_test = torch.utils.data.DataLoader(dataset_test, batch_size=1,
                                                   shuffle=False, num_workers=1,
                                                   collate_fn=utils.collate_fn)

    # Get the model using our helper function.
    model = get_model_instance_segmentation(num_classes)

    # BUG FIX: this call was commented out, so the model stayed on the CPU
    # while training/inference tensors below are moved to `device` — that
    # mismatch crashes whenever `device` is a CUDA device.
    model.to(device)

    # Construct an optimizer over trainable parameters only.
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9,
                                weight_decay=0.0005)
    # Decay LR by 10x every 3 epochs.
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3,
                                                   gamma=0.1)

    num_epochs = 2  # 10

    for epoch in range(num_epochs):
        # Train for one epoch, printing every 10 iterations.
        train_one_epoch(model, optimizer, data_loader, device, epoch,
                        print_freq=10)
        # Update the learning rate.
        lr_scheduler.step()
        # Evaluate on the test dataset.
        evaluate(model, data_loader_test, device=device)

    print('That is all!')

    # Visualize the prediction for a single test image.
    model.eval()
    for test_images, targets in data_loader_test:
        test_images = [images.to(device) for images in list(test_images)]
        output = model(test_images)
        show_landmark_roi(test_images[0], output[0])
        break
# --- Mask R-CNN fine-tuning loop (top-level fragment) ---
# NOTE(review): relies on names defined elsewhere (init_epoch, mask_rcnn,
# optimizer, data_loader, data_loader_test, device, lr_scheduler,
# train_one_epoch, evaluate) — confirm they are in scope.
num_epochs = 24
# Save the starting weights once under the epoch index training resumes from.
save_param = "trained_param_bishop_tl_dem1/epoch_{:04d}.param".format(
    init_epoch)
torch.save(mask_rcnn.state_dict(), save_param)
#'''
for epoch in range(init_epoch, init_epoch + num_epochs):
    save_param = "trained_param_bishop_tl_dem1/epoch_{:04d}.param".format(
        epoch)
    #torch.save(mask_rcnn.state_dict(), save_param)
    # train for one epoch, printing every 10 iterations
    print(save_param)
    train_one_epoch(mask_rcnn, optimizer, data_loader, device, epoch,
                    print_freq=100)
    # update the learning rate
    lr_scheduler.step()
    # evaluate on the test dataset
    #print('\n')
    #print("trained_param_4/epoch_00%02d.param" % epoch)
    #mask_rcnn.load_state_dict(torch.load("trained_param_4/epoch_00%02d.param" % epoch))
    evaluate(mask_rcnn, data_loader_test, device=device)
    #save_param = "trained_param_8_fresh/epoch_{:04d}.param".format(epoch)
    # Persist weights after every epoch.
    torch.save(mask_rcnn.state_dict(), save_param)
# NOTE(review): the ''' below opens a triple-quoted string (a disabled copy of
# the loop) that is NOT closed within this chunk — verify it is terminated
# later in the file, otherwise this is a syntax error.
'''
for epoch in range(init_epoch, init_epoch + num_epochs):
def main():
    """Fine-tune Mask R-CNN instance segmentation on PennFudan and dump
    per-epoch visualizations to disk.

    NOTE(review): the dataset/output paths and the 'cuda:1' device are
    machine-specific — parameterize before reuse.
    """
    PATH = '/media/jefftian/44f36ce2-18b3-4775-952e-6152eedda284/ZTY/data/PennFudanPed'
    # PATH = '/home/ZTY/data/PennFudanPed'
    save_pic_path = '/media/jefftian/44f36ce2-18b3-4775-952e-6152eedda284/ZTY/Fudan_imaging'
    make_and_clear_path(save_pic_path)

    device = torch.device(
        'cuda:1') if torch.cuda.is_available() else torch.device('cpu')
    # (translated) Currently unclear why, but GPU 16 cannot be used;
    # train_one_epoch inside `engine` fails with the error below:
    # RuntimeError: radix_sort: failed on 1st step: cudaErrorInvalidDevice: invalid device ordinal

    # our dataset has two classes only - background and person
    num_classes = 2
    # use our dataset and defined transformations, data saved at PATH
    dataset = PennFudanDataset(PATH, get_transform(train=True))
    dataset_test = PennFudanDataset(PATH, get_transform(train=False))

    # split the dataset in train and test set (last 50 shuffled samples)
    indices = torch.randperm(len(dataset)).tolist()
    dataset = torch.utils.data.Subset(dataset, indices[:-50])
    dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])

    # define training and validation data loaders
    data_loader = torch.utils.data.DataLoader(dataset, batch_size=4,
                                              shuffle=True, num_workers=0,
                                              collate_fn=utils.collate_fn)
    data_loader_test = torch.utils.data.DataLoader(dataset_test, batch_size=1,
                                                   shuffle=False, num_workers=0,
                                                   collate_fn=utils.collate_fn)

    # get the model using our helper function
    model = get_model_instance_segmentation(num_classes)
    # Disabled block below: optional warm start from a checkpoint trained on
    # another (tumor) dataset.
    '''
    # 用肿瘤的拿过来
    model.load_state_dict(torch.load('/media/jefftian/44f36ce2-18b3-4775-952e-6152eedda284/ZTY/'
                                     + 'saved_models/maskrcnn_resnet50_2_ins_seg_S2.pth'))
    '''
    # move model to the right device
    model.to(device)

    # construct an optimizer over trainable parameters only
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9,
                                weight_decay=0.0005)
    # and a learning rate scheduler (10x decay every 3 epochs)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3,
                                                   gamma=0.1)

    # let's train it for 10 epochs
    num_epochs = 10

    # (translated) Record the pre-training (pretrained-only) performance first.
    evaluate(model, data_loader_test, check_num=20,
             device=device, save_pic_path=save_pic_path)
    print("done")

    for epoch in range(1, num_epochs + 1):
        # train for one epoch, printing every 10 iterations
        train_one_epoch(model, optimizer, data_loader, device, epoch,
                        print_freq=10)
        # update the learning rate
        lr_scheduler.step()
        # evaluate on the test dataset, saving visualizations for this epoch
        evaluate(model, data_loader_test, check_num=20, device=device,
                 epoch_num=epoch, save_pic_path=save_pic_path)

    print("That's it!")
def main():
    """Train a fruit object-detection model as configured by config.yaml,
    evaluating and checkpointing after every epoch."""
    # ------ Parse config ----------
    with open('config.yaml') as f:
        cfg = addict.Dict(yaml.safe_load(f))
    os.environ['CUDA_VISIBLE_DEVICES'] = cfg.device

    data = pd.read_csv(cfg.dataset_path)
    image_size = cfg.image_size
    pic_dir = cfg.pic_dir
    split_rate = cfg.split_rate
    num_classes = cfg.num_classes
    num_epochs = cfg.num_epochs
    checkpoint_path = cfg.checkpoint_path
    logdir = cfg.logdir
    device = torch.device('cuda:0')

    # -------- Train and validation datasets -------
    train_size = int(data.shape[0] * split_rate)
    validation_size = data.shape[0] - train_size
    data = data.sample(frac=1)  # shuffle rows before the split
    train_col = ['train'] * train_size + ['val'] * validation_size
    data['train'] = train_col

    train_df = data[data['train'] == 'train'].reset_index(drop=True)
    train_dataset = utils.FruitImagesDataset(pic_dir, image_size, image_size,
                                             train_df,
                                             transforms=get_transform(True))
    data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=8,
        shuffle=True,
        num_workers=4,
        collate_fn=utils.collate_fn,
    )

    val_df = data[data['train'] == 'val'].reset_index(drop=True)
    val_df.reset_index(drop=True).to_csv('validation.csv', index=False)
    # TODO(review): validation uses get_transform(True) (train-time
    # augmentations); get_transform(False) is the usual choice — confirm intent.
    val_dataset = utils.FruitImagesDataset(pic_dir, image_size, image_size,
                                           val_df,
                                           transforms=get_transform(True))
    val_data_loader = torch.utils.data.DataLoader(val_dataset,
                                                  batch_size=8,
                                                  num_workers=4,
                                                  collate_fn=utils.collate_fn)

    # ------ Model for object detection ------
    model = get_object_detection_model(num_classes)
    model.cuda()
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9,
                                weight_decay=0.0005)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3,
                                                   gamma=0.1)

    # BUG FIX: num_epochs was re-assigned to a hard-coded 10 here, silently
    # discarding cfg.num_epochs read above; the override is removed so the
    # config value actually takes effect.
    writer = SummaryWriter(logdir)
    for epoch in range(num_epochs):
        train_one_epoch(
            model,
            optimizer,
            data_loader,
            device,  # was a duplicated torch.device('cuda:0') literal
            epoch,
            writer=writer,
            print_freq=20,
        )
        save_checkpoint(epoch, model, optimizer, checkpoint_path)
        lr_scheduler.step()
        evaluate(model, val_data_loader, device, writer)
    print('dats it')
def main(args):
    """Distributed training entry point for a DeiT-style vision transformer.

    Builds datasets/samplers, optionally warm-starts from a finetune
    checkpoint (with position-embedding interpolation), supports mixup/cutmix,
    model EMA, knowledge distillation, resume, eval-only mode, and a full
    training loop with checkpointing and JSON logging.
    """
    utils.init_distributed_mode(args)
    print(args)

    if args.distillation_type != 'none' and args.finetune and not args.eval:
        raise NotImplementedError("Finetuning with distillation not yet supported")

    device = torch.device(args.device)

    # fix the seed for reproducibility (offset by rank so workers differ)
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    # random.seed(seed)
    cudnn.benchmark = True

    dataset_train, args.nb_classes = build_dataset(is_train=True, args=args)
    dataset_val, _ = build_dataset(is_train=False, args=args)

    # NOTE(review): the distributed branch is hard-wired on; the original
    # condition is preserved in the comment.
    if True:  # args.distributed:
        num_tasks = utils.get_world_size()
        global_rank = utils.get_rank()
        if args.repeated_aug:
            # Repeated-augmentation sampler for distributed training.
            sampler_train = RASampler(
                dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True
            )
        else:
            sampler_train = torch.utils.data.DistributedSampler(
                dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True
            )
        if args.dist_eval:
            if len(dataset_val) % num_tasks != 0:
                print('Warning: Enabling distributed evaluation with an eval dataset not divisible by process number. '
                      'This will slightly alter validation results as extra duplicate entries are added to achieve '
                      'equal num of samples per-process.')
            sampler_val = torch.utils.data.DistributedSampler(
                dataset_val, num_replicas=num_tasks, rank=global_rank, shuffle=False)
        else:
            sampler_val = torch.utils.data.SequentialSampler(dataset_val)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train, sampler=sampler_train,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=True,
    )
    # Larger eval batches are safe since no activations are kept for backprop.
    data_loader_val = torch.utils.data.DataLoader(
        dataset_val, sampler=sampler_val,
        batch_size=int(1.5 * args.batch_size),
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=False
    )

    mixup_fn = None
    mixup_active = args.mixup > 0 or args.cutmix > 0. or args.cutmix_minmax is not None
    if mixup_active:
        mixup_fn = Mixup(
            mixup_alpha=args.mixup, cutmix_alpha=args.cutmix,
            cutmix_minmax=args.cutmix_minmax,
            prob=args.mixup_prob, switch_prob=args.mixup_switch_prob,
            mode=args.mixup_mode,
            label_smoothing=args.smoothing, num_classes=args.nb_classes)

    print(f"Creating model: {args.model}")
    model = create_model(
        args.model,
        pretrained=False,
        num_classes=args.nb_classes,
        drop_rate=args.drop,
        drop_path_rate=args.drop_path,
        drop_block_rate=None,
    )

    if args.finetune:
        if args.finetune.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.finetune, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.finetune, map_location='cpu')

        checkpoint_model = checkpoint['model']
        state_dict = model.state_dict()
        # Drop classification heads whose shape no longer matches (different
        # number of classes in the finetune target).
        for k in ['head.weight', 'head.bias', 'head_dist.weight', 'head_dist.bias']:
            if k in checkpoint_model and checkpoint_model[k].shape != state_dict[k].shape:
                print(f"Removing key {k} from pretrained checkpoint")
                del checkpoint_model[k]

        # interpolate position embedding to the new input resolution
        pos_embed_checkpoint = checkpoint_model['pos_embed']
        embedding_size = pos_embed_checkpoint.shape[-1]
        num_patches = model.patch_embed.num_patches
        num_extra_tokens = model.pos_embed.shape[-2] - num_patches
        # height (== width) for the checkpoint position embedding
        orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
        # height (== width) for the new position embedding
        new_size = int(num_patches ** 0.5)
        # class_token and dist_token are kept unchanged
        extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
        # only the position tokens are interpolated
        pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
        pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size,
                                        embedding_size).permute(0, 3, 1, 2)
        pos_tokens = torch.nn.functional.interpolate(
            pos_tokens, size=(new_size, new_size), mode='bicubic',
            align_corners=False)
        pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
        new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
        checkpoint_model['pos_embed'] = new_pos_embed

        model.load_state_dict(checkpoint_model, strict=False)

    model.to(device)

    model_ema = None
    if args.model_ema:
        # Important to create EMA model after cuda(), DP wrapper, and AMP but
        # before SyncBN and DDP wrapper
        model_ema = ModelEma(
            model,
            decay=args.model_ema_decay,
            device='cpu' if args.model_ema_force_cpu else '',
            resume='')

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    # Linear LR scaling rule with a base batch size of 512.
    linear_scaled_lr = args.lr * args.batch_size * utils.get_world_size() / 512.0
    args.lr = linear_scaled_lr
    optimizer = create_optimizer(args, model_without_ddp)
    loss_scaler = NativeScaler()

    lr_scheduler, _ = create_scheduler(args, optimizer)

    # Default criterion is immediately replaced by one of the branches below;
    # kept for parity with the original code.
    criterion = LabelSmoothingCrossEntropy()

    if args.mixup > 0.:
        # smoothing is handled with mixup label transform
        criterion = SoftTargetCrossEntropy()
    elif args.smoothing:
        criterion = LabelSmoothingCrossEntropy(smoothing=args.smoothing)
    else:
        criterion = torch.nn.CrossEntropyLoss()

    teacher_model = None
    if args.distillation_type != 'none':
        assert args.teacher_path, 'need to specify teacher-path when using distillation'
        print(f"Creating teacher model: {args.teacher_model}")
        teacher_model = create_model(
            args.teacher_model,
            pretrained=False,
            num_classes=args.nb_classes,
            global_pool='avg',
        )
        if args.teacher_path.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.teacher_path, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.teacher_path, map_location='cpu')
        teacher_model.load_state_dict(checkpoint['model'])
        teacher_model.to(device)
        teacher_model.eval()

    # wrap the criterion in our custom DistillationLoss, which
    # just dispatches to the original criterion if args.distillation_type is 'none'
    criterion = DistillationLoss(
        criterion, teacher_model, args.distillation_type,
        args.distillation_alpha, args.distillation_tau
    )

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        # Only restore optimizer/scheduler state when actually training.
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1
            if args.model_ema:
                utils._load_checkpoint_for_ema(model_ema, checkpoint['model_ema'])
            if 'scaler' in checkpoint:
                loss_scaler.load_state_dict(checkpoint['scaler'])

    if args.eval:
        test_stats = evaluate(data_loader_val, model, device)
        print(f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%")
        return

    print(f"Start training for {args.epochs} epochs")
    start_time = time.time()
    max_accuracy = 0.0
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            data_loader_train.sampler.set_epoch(epoch)

        train_stats = train_one_epoch(
            model, criterion, data_loader_train,
            optimizer, device, epoch, loss_scaler,
            args.clip_grad, model_ema, mixup_fn,
            set_training_mode=args.finetune == ''  # keep in eval mode during finetuning
        )

        lr_scheduler.step(epoch)
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            for checkpoint_path in checkpoint_paths:
                # NOTE(review): get_state_dict(model_ema) is called even when
                # model_ema is None — confirm args.model_ema is always set on
                # this path.
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'model_ema': get_state_dict(model_ema),
                    'scaler': loss_scaler.state_dict(),
                    'args': args,
                }, checkpoint_path)

        test_stats = evaluate(data_loader_val, model, device)
        print(f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%")
        max_accuracy = max(max_accuracy, test_stats["acc1"])
        print(f'Max accuracy: {max_accuracy:.2f}%')

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     **{f'test_{k}': v for k, v in test_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}

        # Only the main process appends to the shared log file.
        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def __main__():
    """Distributed (NCCL) training entry point for BlendMask on COCO.

    NOTE(review): `__main__` is an ordinary function name — Python does not
    call it automatically; confirm something invokes it (e.g. an
    `if __name__ == '__main__':` guard elsewhere).
    """
    args = get_args_parser()
    dist.init_process_group(backend='nccl')
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    # Per-rank seed offset so workers draw different random streams.
    set_random_seed(args.random_seed + dist.get_rank())
    # One GPU per process, selected by rank.
    torch.cuda.set_device(torch.device('cuda:{}'.format(dist.get_rank())))
    dist_logger = DistributedLogger(args.name, args.output_base_path,
                                    args.master_rank, args.use_tensorboard)

    train_dataset = TrainDataset(args.dataset_root, args.dataset_year,
                                 (args.input_size_h, args.input_size_w),
                                 args.pooler_size)
    train_sampler = data.distributed.DistributedSampler(train_dataset)
    train_dataloader = data.DataLoader(train_dataset,
                                       batch_size=args.batch_size,
                                       num_workers=args.num_workers,
                                       sampler=train_sampler,
                                       pin_memory=True,
                                       drop_last=True)

    val_dataset = ValDataset(args.dataset_root, args.dataset_year,
                             (args.input_size_h, args.input_size_w))
    val_sampler = data.distributed.DistributedSampler(val_dataset)
    # NOTE(review): val_dataloader is built but never consumed in the loop
    # below — the evaluation call may live outside this chunk; verify.
    val_dataloader = data.DataLoader(val_dataset,
                                     batch_size=args.batch_size,
                                     num_workers=args.num_workers,
                                     pin_memory=True,
                                     sampler=val_sampler)

    model = BlendMask(len(COCO_CLASSES), args.fpn_channels,
                      args.bases_module_channels, args.num_bases,
                      args.atten_size, args.pooler_size).cuda()
    # model.load_state_dict(torch.load(f'./output/{args.name}/model/param.pth'))
    model = parallel.DistributedDataParallel(model,
                                             device_ids=[dist.get_rank()],
                                             find_unused_parameters=True)
    criterion = Criterion(args.focal_alpha, args.focal_gamma)

    # Separate parameter group for biases: scaled LR and weight decay.
    optim_parameters = [{
        'params': [
            p for n, p in model.module.named_parameters()
            if not n.endswith('bias') and p.requires_grad
        ]
    }, {
        'params': [
            p for n, p in model.module.named_parameters()
            if n.endswith('bias') and p.requires_grad
        ],
        'lr': args.lr * args.bias_lr_mul,
        'weight_decay': args.weight_decay * args.bias_weight_decay_mul
    }]
    optimizer = optim.SGD(optim_parameters, lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    # Warm-up followed by multi-step decay, stepped per iteration
    # (lambda is built from the number of batches per epoch).
    lr_lambda = utils.lr_lambda.get_warm_up_multi_step_lr_lambda(
        len(train_dataloader), args.warm_up_epoch, args.warm_up_ratio,
        args.milestones, args.step_gamma)
    lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    # NOTE(review): nms_cfg is unused within this chunk — presumably consumed
    # by an evaluation/inference step outside this view.
    nms_cfg = {
        'nms_pre': args.nms_pre,
        'cls_score_thr': args.nms_cls_score_thr,
        'iou_thr': args.nms_iou_thr
    }

    for epoch_idx in range(args.epochs):
        # Reshuffle shards each epoch on every rank.
        train_sampler.set_epoch(epoch_idx)
        val_sampler.set_epoch(epoch_idx)
        engine.train_one_epoch(model, criterion, optimizer, lr_scheduler,
                               train_dataloader, epoch_idx, dist_logger)
def main():
    """Train Mask R-CNN on the LiDAR dataset, saving weights after each epoch."""
    # Run on the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Exactly two classes: background and person.
    num_classes = 2

    output_folder = 'E:/PYTORCH-MASK/'
    train_set = LidarDataset('E:/PYTORCH-MASK/TRAIN-FINAL/')
    test_set = LidarDataset('E:/PYTORCH-MASK/TEST-FINAL/')

    # Loaders: shuffled single-sample batches for training, ordered for test.
    train_loader = torch.utils.data.DataLoader(
        train_set, batch_size=1, shuffle=True, num_workers=4,
        collate_fn=utils.collate_fn)
    test_loader = torch.utils.data.DataLoader(
        test_set, batch_size=1, shuffle=False, num_workers=4,
        collate_fn=utils.collate_fn)

    # Three-channel-input variant of the instance-segmentation model.
    model = get_model_instance_segmentation(num_classes, input_channel=3)
    model.to(device)

    # Optimize only parameters that require gradients.
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(trainable, lr=0.005, momentum=0.9,
                                weight_decay=0.0005)
    # Decay the learning rate tenfold every three epochs.
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3,
                                                   gamma=0.1)

    num_epochs = 3
    for epoch in range(num_epochs):
        # One training pass, logging every 100 iterations.
        train_one_epoch(model, optimizer, train_loader, device, epoch,
                        print_freq=100)
        lr_scheduler.step()
        # Score on the held-out test split.
        evaluate(model, test_loader, device=device)
        # Persist this epoch's weights.
        torch.save(
            model.state_dict(),
            os.path.join(output_folder, 'model_final_{}.pth'.format(epoch)))

    print("Model is trained")
def main(args):
    """QPIC training/evaluation entry point (DETR-style HOI detection).

    Builds the model and dataloaders, optionally resumes or loads pretrained
    weights, supports eval-only mode, and runs the training loop with
    checkpointing and wandb/JSON logging for the hico and vcoco benchmarks.
    """
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))
    wandb.init(project="qpic-project", entity="sangbaeklee", group="experiment_qpic")
    wandb.config = {
        "learning_rate": args.lr,
        "epochs": args.epochs,
        "batch_size": args.batch_size,
    }
    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility (offset by rank so workers differ)
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)
    wandb.watch(model)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    # Separate parameter group for the backbone with its own learning rate.
    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val, args.batch_size,
                                 sampler=sampler_val, drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    # COCO ground-truth API is only needed for non-HOI (box/panoptic) eval.
    if not args.hoi:
        if args.dataset_file == "coco_panoptic":
            # We also evaluate AP during panoptic training, on original coco DS
            coco_val = datasets.coco.build("val", args)
            base_ds = get_coco_api_from_dataset(coco_val)
        else:
            base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        # Only restore optimizer/scheduler state when actually training.
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1
    elif args.pretrained:
        checkpoint = torch.load(args.pretrained, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'], strict=False)

    if args.eval:
        if args.hoi:
            test_stats = evaluate_hoi(args.dataset_file, model, postprocessors,
                                      data_loader_val,
                                      args.subject_category_id, device)
            return
        else:
            test_stats, coco_evaluator = evaluate(model, criterion,
                                                  postprocessors,
                                                  data_loader_val, base_ds,
                                                  device, args.output_dir)
            if args.output_dir:
                utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                     output_dir / "eval.pth")
            return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        if args.hoi:
            test_stats = evaluate_hoi(args.dataset_file, model, postprocessors,
                                      data_loader_val,
                                      args.subject_category_id, device)
            coco_evaluator = None
        else:
            test_stats, coco_evaluator = evaluate(model, criterion,
                                                  postprocessors,
                                                  data_loader_val, base_ds,
                                                  device, args.output_dir)

        log_stats = {
            **{f'train_{k}': v for k, v in train_stats.items()},
            **{f'test_{k}': v for k, v in test_stats.items()},
            'epoch': epoch,
            'n_parameters': n_parameters
        }
        #import pdb; pdb.set_trace()
        if args.dataset_file == 'hico':
            wandb.log({
                "loss": train_stats['loss'],
                "mAP": test_stats['mAP'],
                "mAP rare": test_stats['mAP rare'],
                "mAP non-rare": test_stats['mAP non-rare'],
                "mean max recall": test_stats['mean max recall']
            })
        elif args.dataset_file == 'vcoco':
            wandb.log({
                "mAP_all": test_stats['mAP_all'],
                "mAP_thesis": test_stats['mAP_thesis'],
                "AP_hold_obj": test_stats['AP_hold_obj'],
                "AP_stand": test_stats['AP_stand'],
                "AP_sit_instr": test_stats['AP_sit_instr'],
                "AP_ride_instr": test_stats['AP_ride_instr'],
                "AP_walk": test_stats['AP_walk'],
                "AP_look_obj": test_stats['AP_look_obj'],
                "AP_hit_instr": test_stats['AP_hit_instr'],
                "AP_hit_obj": test_stats['AP_hit_obj'],
                "AP_eat_obj": test_stats['AP_eat_obj'],
                "AP_eat_instr": test_stats['AP_eat_instr'],
                "AP_jump_instr": test_stats['AP_jump_instr'],
                "AP_lay_instr": test_stats['AP_lay_instr'],
                "AP_talk_on_phone_instr": test_stats['AP_talk_on_phone_instr'],
                "AP_carry_obj": test_stats['AP_carry_obj'],
                "AP_throw_obj": test_stats['AP_throw_obj'],
                "AP_catch_obj": test_stats['AP_catch_obj'],
                "AP_cut_instr": test_stats['AP_cut_instr'],
                "AP_cut_obj": test_stats['AP_cut_obj'],
                "AP_run": test_stats['AP_run'],
                "AP_work_on_computer_instr": test_stats['AP_work_on_computer_instr'],
                "AP_ski_instr": test_stats['AP_ski_instr'],
                "AP_surf_instr": test_stats['AP_surf_instr'],
                "AP_skateboard_instr": test_stats['AP_skateboard_instr'],
                "AP_smile": test_stats['AP_smile'],
                "AP_drink_instr": test_stats['AP_drink_instr'],
                "AP_kick_obj": test_stats['AP_kick_obj'],
                "AP_point_instr": test_stats['AP_point_instr'],
                "AP_read_obj": test_stats['AP_read_obj'],
                "AP_snowboard_instr": test_stats['AP_snowboard_instr'],
                "loss": train_stats['loss']
            })
        else:
            # NOTE(review): for any other dataset this `continue` also skips
            # the log-file writing below — confirm that is intended.
            continue

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    """Distributed training entry point for a DeiT-style vision transformer
    (variant without finetuning/distillation): datasets, samplers, optional
    mixup/cutmix and model EMA, resume, eval-only mode, and the training loop
    with checkpointing and JSON logging."""
    utils.init_distributed_mode(args)
    print(args)
    device = torch.device(args.device)

    # fix the seed for reproducibility (offset by rank so workers differ)
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    # random.seed(seed)
    cudnn.benchmark = True

    dataset_train, args.nb_classes = build_dataset(is_train=True, args=args)
    dataset_val, _ = build_dataset(is_train=False, args=args)

    # NOTE: the distributed branch is hard-wired on (original condition kept
    # in the comment).
    if True:  # args.distributed:
        num_tasks = utils.get_world_size()
        global_rank = utils.get_rank()
        if args.repeated_aug:
            sampler_train = RASampler(dataset_train,
                                      num_replicas=num_tasks,
                                      rank=global_rank,
                                      shuffle=True)
        else:
            sampler_train = torch.utils.data.DistributedSampler(
                dataset_train, num_replicas=num_tasks, rank=global_rank,
                shuffle=True)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train,
        sampler=sampler_train,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=True,
    )
    # Larger eval batches are safe: no activations kept for backprop.
    data_loader_val = torch.utils.data.DataLoader(dataset_val,
                                                  batch_size=int(
                                                      1.5 * args.batch_size),
                                                  shuffle=False,
                                                  num_workers=args.num_workers,
                                                  pin_memory=args.pin_mem,
                                                  drop_last=False)

    mixup_fn = None
    mixup_active = args.mixup > 0 or args.cutmix > 0. or args.cutmix_minmax is not None
    if mixup_active:
        mixup_fn = Mixup(mixup_alpha=args.mixup,
                         cutmix_alpha=args.cutmix,
                         cutmix_minmax=args.cutmix_minmax,
                         prob=args.mixup_prob,
                         switch_prob=args.mixup_switch_prob,
                         mode=args.mixup_mode,
                         label_smoothing=args.smoothing,
                         num_classes=args.nb_classes)

    print(f"Creating model: {args.model}")
    model = create_model(
        args.model,
        pretrained=False,
        num_classes=args.nb_classes,
        drop_rate=args.drop,
        drop_path_rate=args.drop_path,
        drop_block_rate=args.drop_block,
    )
    # TODO: finetuning

    model.to(device)

    model_ema = None
    if args.model_ema:
        # Important to create EMA model after cuda(), DP wrapper, and AMP but
        # before SyncBN and DDP wrapper
        model_ema = ModelEma(model,
                             decay=args.model_ema_decay,
                             device='cpu' if args.model_ema_force_cpu else '',
                             resume='')

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    # Linear LR scaling rule with a base batch size of 512.
    linear_scaled_lr = args.lr * args.batch_size * utils.get_world_size(
    ) / 512.0
    args.lr = linear_scaled_lr
    # BUG FIX: the optimizer was built from the DDP-wrapped `model`; under
    # DDP every parameter name gains a 'module.' prefix, which defeats the
    # name-based weight-decay skip lists inside create_optimizer. Build it
    # from the unwrapped model, matching the sibling training script in this
    # file (the parameters themselves are the same tensors either way).
    optimizer = create_optimizer(args, model_without_ddp)
    loss_scaler = NativeScaler()

    lr_scheduler, _ = create_scheduler(args, optimizer)

    # Default criterion is immediately replaced by one of the branches below;
    # kept for parity with the original code.
    criterion = LabelSmoothingCrossEntropy()

    if args.mixup > 0.:
        # smoothing is handled with mixup label transform
        criterion = SoftTargetCrossEntropy()
    elif args.smoothing:
        criterion = LabelSmoothingCrossEntropy(smoothing=args.smoothing)
    else:
        criterion = torch.nn.CrossEntropyLoss()

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        # Only restore optimizer/scheduler state when actually training.
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1
            if args.model_ema:
                utils._load_checkpoint_for_ema(model_ema,
                                               checkpoint['model_ema'])

    if args.eval:
        test_stats = evaluate(data_loader_val, model, device)
        print(
            f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%"
        )
        return

    print("Start training")
    start_time = time.time()
    max_accuracy = 0.0
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            data_loader_train.sampler.set_epoch(epoch)

        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch, loss_scaler,
                                      args.clip_grad, model_ema, mixup_fn)

        lr_scheduler.step(epoch)
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            for checkpoint_path in checkpoint_paths:
                # NOTE(review): get_state_dict(model_ema) is called even when
                # model_ema is None — confirm args.model_ema is always set on
                # this path.
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'model_ema': get_state_dict(model_ema),
                        'args': args,
                    }, checkpoint_path)

        test_stats = evaluate(data_loader_val, model, device)
        print(
            f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%"
        )
        max_accuracy = max(max_accuracy, test_stats["acc1"])
        print(f'Max accuracy: {max_accuracy:.2f}%')

        log_stats = {
            **{f'train_{k}': v for k, v in train_stats.items()},
            **{f'test_{k}': v for k, v in test_stats.items()},
            'epoch': epoch,
            'n_parameters': n_parameters
        }

        # Only the main process appends to the shared log file.
        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main():
    """Fine-tune an instance-segmentation model on the PennFudan
    pedestrian dataset and evaluate it after every epoch."""
    # Prefer the GPU when one is available; otherwise run on the CPU.
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # Only two classes exist in this dataset: background and person.
    num_classes = 2

    # Two dataset views over the same images: augmented for training,
    # plain for evaluation.
    train_data = PennFudanDataset('PennFudanPed', get_transform(train=True))
    eval_data = PennFudanDataset('PennFudanPed', get_transform(train=False))

    # Shuffle once, then hold out the final 50 samples for testing.
    order = torch.randperm(len(train_data)).tolist()
    train_subset = torch.utils.data.Subset(train_data, order[:-50])
    eval_subset = torch.utils.data.Subset(eval_data, order[-50:])

    # Training and validation data loaders.
    train_loader = torch.utils.data.DataLoader(train_subset,
                                               batch_size=2,
                                               shuffle=True,
                                               num_workers=4,
                                               collate_fn=utils.collate_fn)
    eval_loader = torch.utils.data.DataLoader(eval_subset,
                                              batch_size=1,
                                              shuffle=False,
                                              num_workers=4,
                                              collate_fn=utils.collate_fn)

    # Build the model via the helper and place it on the chosen device.
    model = get_model_instance_segmentation(num_classes)
    model.to(device)

    # SGD over the trainable parameters, with a 10x step decay every
    # 3 epochs.
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(trainable,
                                lr=0.005,
                                momentum=0.9,
                                weight_decay=0.0005)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)

    num_epochs = 10
    for epoch in range(num_epochs):
        # One pass over the training data (logging every 10 iterations),
        # then an LR step and a full evaluation pass.
        train_one_epoch(model, optimizer, train_loader, device, epoch,
                        print_freq=10)
        lr_scheduler.step()
        evaluate(model, eval_loader, device=device)

    print("That's it!")
def main(args):
    """Distributed training entry point: builds the model, datasets and
    optimizer from ``args``, optionally resumes from a checkpoint, then runs
    the train/eval loop with periodic checkpointing."""
    bz = args.batch_size  # NOTE(review): bound but unused below
    lr = args.lr  # NOTE(review): likewise unused; args.lr is read directly later
    # Select the compute device. NOTE(review): if args.cuda is set but no GPU
    # is visible, `device` is never assigned here — confirm upstream guarantees
    # at least one CUDA device when --cuda is passed.
    if args.cuda:
        if torch.cuda.device_count() >= 1:
            utils.init_distributed_mode(args)
            device = torch.device(args.device)
    else:
        device = torch.device('cpu')

    # fix the seed for reproducibility (rank-offset under CUDA so each
    # distributed worker gets a distinct random stream)
    if args.cuda:
        seed = args.seed + utils.get_rank()
    else:
        seed = args.seed
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # set up model (plus its loss and output post-processors)
    model, criterion, postprocessors = build_model(args)
    model_without_ddp = model  # unwrapped handle used for checkpointing
    if args.cuda and args.distributed:
        if args.mp:
            # Multiprocessing path: DDP without explicit device placement.
            model = torch.nn.parallel.DistributedDataParallel(model)
        else:
            model = torch.nn.parallel.DistributedDataParallel(
                model.to(args.gpu),
                device_ids=[args.gpu],
                find_unused_parameters=True)
        model_without_ddp = model.module
    elif args.cuda:
        model = model.to(device)
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    # set up model training: two parameter groups — the "joiner" sub-module
    # trains at its own learning rate (args.lr_joiner).
    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "joiner" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "joiner" in n and p.requires_grad
            ],
            "lr": args.lr_joiner,
        },
    ]

    # datasets build
    dataset_train = build_dataset(mode="training", args=args)
    dataset_test = build_dataset(mode="testing", args=args)

    if args.cuda and args.distributed:
        # NOTE(review): shuffle=False on the train sampler means the
        # set_epoch() call below has no effect on ordering — confirm intended.
        sampler_train = DistributedSampler(dataset_train, shuffle=False)
        sampler_test = DistributedSampler(dataset_test, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_test = torch.utils.data.SequentialSampler(dataset_test)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_test = DataLoader(dataset_test,
                                  1,
                                  sampler=sampler_test,
                                  drop_last=False,
                                  collate_fn=utils.collate_fn,
                                  num_workers=args.num_workers)

    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    # output and checkpoints directory (best-effort creation; OSError from
    # concurrent workers racing to create the directory is swallowed)
    checkpoint_dir = args.output_dir
    if not os.path.exists(checkpoint_dir):
        try:
            os.makedirs(checkpoint_dir)
        except OSError:
            pass

    # Optionally resume model / optimizer / scheduler state from a local file.
    if args.resume:
        checkpoint = Path(args.resume)
        assert checkpoint.exists()
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    print("Start Training")
    start_time = time.time()
    optimizer.zero_grad()
    for epoch in range(args.start_epoch, args.epochs):
        if args.cuda and args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(epoch, args.clip_max_norm, model,
                                      criterion, data_loader_train, optimizer,
                                      lr_scheduler, device)
        if args.output_dir:
            checkpoint_dir = Path(checkpoint_dir)
            checkpoint_paths = [checkpoint_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (
                    epoch + 1) % args.save_checkpoint_every == 0:
                checkpoint_paths.append(checkpoint_dir /
                                        f'checkpoint{epoch:05}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)
        if (epoch + 1) % args.eval_interval == 0:
            # evaluation — train and test stats merged into one JSON line
            test_stats = evaluate(epoch, model, criterion, postprocessors,
                                  data_loader_test, args.output_dir,
                                  args.dataset, device)

            log_stats = {
                **{'train_' + str(k): v for k, v in train_stats.items()},
                **{'test_' + str(k): v for k, v in test_stats.items()},
                'epoch': epoch,
                'n_parameters': n_parameters
            }

            # Only rank 0 appends to the JSON log.
            if args.output_dir and utils.is_main_process():
                with (checkpoint_dir / 'log.json').open("a") as f:
                    f.write(json.dumps(log_stats) + "\n")
        lr_scheduler.step()

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    """torchvision-style detection training driver.

    Builds (optionally distributed, aspect-ratio-grouped) data loaders,
    constructs the model, optimizer and LR scheduler from ``args``,
    optionally resumes from a checkpoint, then trains for ``args.epochs``
    with per-epoch checkpointing and evaluation.
    """
    if args.output_dir:
        utils.mkdir(args.output_dir)

    utils.init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)

    if args.use_deterministic_algorithms:
        torch.use_deterministic_algorithms(True)

    # Data loading code
    print("Loading data")

    dataset, num_classes = get_dataset(args.dataset, "train",
                                       get_transform(True, args),
                                       args.data_path)
    dataset_test, _ = get_dataset(args.dataset, "val",
                                  get_transform(False, args), args.data_path)

    print("Creating data loaders")
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(
            dataset_test, shuffle=False)
    else:
        train_sampler = torch.utils.data.RandomSampler(dataset)
        test_sampler = torch.utils.data.SequentialSampler(dataset_test)

    if args.aspect_ratio_group_factor >= 0:
        # Group images with similar aspect ratios into the same batch to
        # reduce padding.
        group_ids = create_aspect_ratio_groups(
            dataset, k=args.aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids,
                                                  args.batch_size)
    else:
        train_batch_sampler = torch.utils.data.BatchSampler(train_sampler,
                                                            args.batch_size,
                                                            drop_last=True)

    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_sampler=train_batch_sampler,
        num_workers=args.workers,
        collate_fn=utils.collate_fn)
    data_loader_test = torch.utils.data.DataLoader(dataset_test,
                                                   batch_size=1,
                                                   sampler=test_sampler,
                                                   num_workers=args.workers,
                                                   collate_fn=utils.collate_fn)

    print("Creating model")
    kwargs = {"trainable_backbone_layers": args.trainable_backbone_layers}
    if args.data_augmentation in ["multiscale", "lsj"]:
        # These augmentation pipelines perform their own resizing.
        kwargs["_skip_resize"] = True
    if "rcnn" in args.model:
        if args.rpn_score_thresh is not None:
            kwargs["rpn_score_thresh"] = args.rpn_score_thresh
    model = torchvision.models.detection.__dict__[args.model](
        weights=args.weights,
        weights_backbone=args.weights_backbone,
        num_classes=num_classes,
        **kwargs)
    model.to(device)
    if args.distributed and args.sync_bn:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    # Keep an unwrapped handle for checkpoint save/load under DDP.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    if args.norm_weight_decay is None:
        parameters = [p for p in model.parameters() if p.requires_grad]
    else:
        # Separate weight-decay values for normalization layers vs the rest.
        param_groups = torchvision.ops._utils.split_normalization_params(model)
        wd_groups = [args.norm_weight_decay, args.weight_decay]
        parameters = [{
            "params": p,
            "weight_decay": w
        } for p, w in zip(param_groups, wd_groups) if p]

    opt_name = args.opt.lower()
    if opt_name.startswith("sgd"):
        optimizer = torch.optim.SGD(
            parameters,
            lr=args.lr,
            momentum=args.momentum,
            weight_decay=args.weight_decay,
            # "sgd_nesterov" (any name containing "nesterov") enables it.
            nesterov="nesterov" in opt_name,
        )
    elif opt_name == "adamw":
        optimizer = torch.optim.AdamW(parameters,
                                      lr=args.lr,
                                      weight_decay=args.weight_decay)
    else:
        raise RuntimeError(
            f"Invalid optimizer {args.opt}. Only SGD and AdamW are supported.")

    # Mixed-precision gradient scaler, only when AMP is enabled.
    scaler = torch.cuda.amp.GradScaler() if args.amp else None

    args.lr_scheduler = args.lr_scheduler.lower()
    if args.lr_scheduler == "multisteplr":
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)
    elif args.lr_scheduler == "cosineannealinglr":
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=args.epochs)
    else:
        raise RuntimeError(
            f"Invalid lr scheduler '{args.lr_scheduler}'. Only MultiStepLR and CosineAnnealingLR are supported."
        )

    # Resume model/optimizer/scheduler (and AMP scaler) state if requested.
    if args.resume:
        checkpoint = torch.load(args.resume, map_location="cpu")
        model_without_ddp.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
        args.start_epoch = checkpoint["epoch"] + 1
        if args.amp:
            scaler.load_state_dict(checkpoint["scaler"])

    if args.test_only:
        # Deterministic cudnn for reproducible evaluation numbers.
        torch.backends.cudnn.deterministic = True
        evaluate(model, data_loader_test, device=device)
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            # Reshuffle each rank's shard for this epoch.
            train_sampler.set_epoch(epoch)
        train_one_epoch(model, optimizer, data_loader, device, epoch,
                        args.print_freq, scaler)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint = {
                "model": model_without_ddp.state_dict(),
                "optimizer": optimizer.state_dict(),
                "lr_scheduler": lr_scheduler.state_dict(),
                "args": args,
                "epoch": epoch,
            }
            if args.amp:
                checkpoint["scaler"] = scaler.state_dict()
            utils.save_on_master(
                checkpoint,
                os.path.join(args.output_dir, f"model_{epoch}.pth"))
            utils.save_on_master(
                checkpoint, os.path.join(args.output_dir, "checkpoint.pth"))

        # evaluate after every epoch
        evaluate(model, data_loader_test, device=device)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print(f"Training time {total_time_str}")
def main(args):
    """Detection training driver supporting COCO, VOC and custom
    Pascal-VOC-format datasets (``--dataset custom_voc``)."""
    utils.init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)

    # Data loading code
    print("Loading data")
    # Supports loading a custom Pascal-format dataset: set --dataset to
    # 'custom_voc'.
    if args.dataset == 'custom_voc':
        # dataset, num_classes = get_custom_voc(args.train_data_path,get_transform(train=True))
        # dataset_test, _ = get_custom_voc(args.test_data_path,get_transform(train=False))
        # A custom Pascal dataset needs no image_set split name, so None is
        # passed here instead.
        dataset, num_classes = get_dataset(args.dataset, None,
                                           get_transform(train=True),
                                           args.train_data_path)
        dataset_test, _ = get_dataset(args.dataset, None,
                                      get_transform(train=False),
                                      args.test_data_path)
    else:
        dataset, num_classes = get_dataset(
            args.dataset, "train" if args.dataset == 'coco' else 'trainval',
            get_transform(train=True), args.data_path)
        dataset_test, _ = get_dataset(
            args.dataset, "test" if args.dataset == 'coco' else 'val',
            get_transform(train=False), args.data_path)

    print("Creating data loaders")
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(
            dataset_test)
    else:
        train_sampler = torch.utils.data.RandomSampler(dataset)
        test_sampler = torch.utils.data.SequentialSampler(dataset_test)

    if args.aspect_ratio_group_factor >= 0:
        # Batch images of similar aspect ratio together to cut padding.
        group_ids = create_aspect_ratio_groups(
            dataset, k=args.aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids,
                                                  args.batch_size)
    else:
        train_batch_sampler = torch.utils.data.BatchSampler(train_sampler,
                                                            args.batch_size,
                                                            drop_last=True)

    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_sampler=train_batch_sampler,
        num_workers=args.workers,
        collate_fn=utils.collate_fn)
    data_loader_test = torch.utils.data.DataLoader(dataset_test,
                                                   batch_size=1,
                                                   sampler=test_sampler,
                                                   num_workers=args.workers,
                                                   collate_fn=utils.collate_fn)

    print("Creating model")
    # model = torchvision.models.detection.fasterrcnn_resnet50_fpn()
    model = torchvision.models.detection.__dict__[args.model](
        num_classes=num_classes, pretrained=args.pretrained)
    model.to(device)

    # Unwrapped handle used for checkpoint save/load under DDP.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(
            checkpoint['optimizer'])  # to resume training, the optimizer and LR scheduler state are restored alongside the model
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])

    # For test-only runs the --resume checkpoint doubles as the inference
    # model. The original author only documented --resume for resuming
    # training, but per the official saving/loading docs it also serves
    # inference: https://pytorch.org/tutorials/beginner/saving_loading_models.html
    if args.test_only:
        if not args.resume:
            raise Exception('需要checkpoints模型用于推理!')
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
            model_without_ddp.load_state_dict(checkpoint['model'])
            if 'coco' == args.dataset:
                coco_evaluate(model_without_ddp, data_loader_test,
                              device=device)
            elif 'voc' == args.dataset:
                voc_evaluate(model_without_ddp, data_loader_test,
                             device=device)
            elif 'custom_voc' == args.dataset:
                custom_voc_evaluate(model_without_ddp, data_loader_test,
                                    device=device)
            else:
                print(
                    f'No evaluation method available for the dataset {args.dataset}'
                )
            # evaluate(model, data_loader_test, device=device)
        return

    print("Start training")
    start_time = time.time()
    # NOTE(review): training always starts from epoch 0 — resume above does
    # not restore an epoch counter; confirm this is intended.
    for epoch in range(args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        train_one_epoch(model, optimizer, data_loader, device, epoch,
                        args.print_freq)
        lr_scheduler.step()
        if args.output_dir:
            # model.save('./checkpoints/model_{}_{}.pth'.format(args.dataset, epoch))
            utils.save_on_master(
                {
                    'model': model_without_ddp.state_dict(),  # store only the network weights (not the module object)
                    # 'model': model_without_ddp,  # store the whole network
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'args': args
                },
                os.path.join(args.output_dir,
                             'model_{}_{}.pth'.format(args.dataset, epoch)))

        # evaluate after every epoch
        if args.dataset == 'coco':
            coco_evaluate(model, data_loader_test, device=device)
        elif 'voc' == args.dataset:
            voc_evaluate(model, data_loader_test, device=device)
        elif 'custom_voc' == args.dataset:
            custom_voc_evaluate(model, data_loader_test, device=device)
        else:
            print(
                f'No evaluation method available for the dataset {args.dataset}'
            )

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main():
    """Train Mask R-CNN on PennFudan, save the weights, and visualise one
    test prediction.

    Trains for 10 epochs with SGD + StepLR, evaluating after every epoch,
    then runs inference on a single held-out image and displays the input
    image and the first predicted instance mask.
    """
    # Train on the GPU, or on the CPU if a GPU is not available.
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # Our dataset has two classes only - background and person.
    num_classes = 2
    # Use our dataset with the defined transformations.
    dataset = PennFudanDataset('PennFudanPed', get_transform(train=True))
    dataset_test = PennFudanDataset('PennFudanPed', get_transform(train=False))

    # Split the dataset into a train and a test set: the last 50 shuffled
    # indices are held out for testing.
    indices = torch.randperm(len(dataset)).tolist()
    dataset = torch.utils.data.Subset(dataset, indices[:-50])
    dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])

    # Define training and validation data loaders.
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_size=2,
                                              shuffle=True,
                                              num_workers=4,
                                              collate_fn=utils.collate_fn)
    data_loader_test = torch.utils.data.DataLoader(dataset_test,
                                                   batch_size=1,
                                                   shuffle=False,
                                                   num_workers=4,
                                                   collate_fn=utils.collate_fn)

    # Get the model using our helper function and move it to the device.
    model = get_model_instance_segmentation(num_classes)
    model.to(device)

    # Construct an optimizer over the trainable parameters only.
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9,
                                weight_decay=0.0005)
    # Decay the learning rate by 10x every 3 epochs.
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)

    num_epochs = 10
    for epoch in range(num_epochs):
        # Train for one epoch, printing every 10 iterations.
        train_one_epoch(model, optimizer, data_loader, device, epoch,
                        print_freq=10)
        # Update the learning rate.
        lr_scheduler.step()
        # Evaluate on the test dataset.
        evaluate(model, data_loader_test, device=device)

    # Persist only the trained weights (state_dict), not the whole module.
    torch.save(model.state_dict(), 'mask_R_CNN.pth')

    # Pick one image from the test set and run inference on it.
    img, _ = dataset_test[0]
    model.eval()
    with torch.no_grad():
        prediction = model([img.to(device)])
    print(prediction)

    # Display the input image and the first predicted instance mask.
    # FIX: use the public Image.show() API instead of the private
    # Image._show() helper, which is an implementation detail of Pillow.
    im = Image.fromarray(img.mul(255).permute(1, 2, 0).byte().numpy())
    im.show()
    im2 = Image.fromarray(
        prediction[0]['masks'][0, 0].mul(255).byte().cpu().numpy())
    im2.show()
    print("That's it!")
def main():
    """DDP entry point for fine-tuning a Faster R-CNN box head on UBIRISPr.

    Freezes the pretrained ResNet-50 backbone, attaches a fresh
    FastRCNNPredictor head, and trains it under DistributedDataParallel.

    NOTE(review): relies on module-level configuration globals (``seed``,
    ``num_classes``, ``batch_size``, ``num_workers``, ``learning_rate``,
    ``weight_decay``, ``gamma``, ``num_epochs``) — confirm they are defined
    at import time.
    """
    # DDP wrapping: each process learns its rank from the launcher.
    parser = ArgumentParser('The Window DDP')
    parser.add_argument('--local_rank',
                        type=int,
                        default=-1,
                        metavar='N',
                        help='Local process rank.')
    args = parser.parse_args()
    args.is_master = args.local_rank == 0
    # FIX: torch.cuda.device(rank) is a context manager for selecting the
    # current device, not a device object — build a real torch.device.
    args.device = torch.device('cuda', args.local_rank)

    dist.init_process_group(backend='nccl', init_method='env://')
    torch.cuda.set_device(args.local_rank)
    torch.cuda.manual_seed_all(seed)

    # Import a pretrained Faster R-CNN with a ResNet-50 backbone.
    model = \
        torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

    # Freeze all pretrained weights; only the new head will train.
    for param in model.parameters():
        param.requires_grad = False

    # Attach a fresh head to train on UBIRISPr.
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    # Send to this process's GPU and wrap the model in DDP.
    model = model.to(args.local_rank)
    ddp_model = DDP(model,
                    device_ids=[args.local_rank],
                    output_device=args.local_rank)

    # Build Datasets using the helper document built by csvBuilder.
    train_set = UBIRISPrDataset(r'../data/Train_Set_small.csv')
    test_set = UBIRISPrDataset(r'../data/Test_Set_small.csv')

    # Define collate function and samplers for DDP so each rank sees a
    # distinct shard of the data.
    def collate_fn(batch):
        return tuple(zip(*batch))

    train_sampler = DistributedSampler(train_set)
    test_sampler = DistributedSampler(test_set)

    # Build Train and Test DataLoaders.
    train_load = DataLoader(train_set,
                            batch_size=batch_size,
                            num_workers=num_workers,
                            pin_memory=True,
                            collate_fn=collate_fn,
                            sampler=train_sampler)
    test_load = DataLoader(test_set,
                           batch_size=batch_size,
                           num_workers=num_workers,
                           pin_memory=True,
                           collate_fn=collate_fn,
                           sampler=test_sampler)

    # Optimizer: SGD over only the trainable (head) parameters.
    params = [p for p in ddp_model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=learning_rate,
                                momentum=0.9,
                                weight_decay=weight_decay)

    # Learning rate scheduler: step decay every 3 epochs.
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=gamma)

    # Train using the PyTorch Vision reference helpers.
    for epoch in range(num_epochs):
        # Train for one epoch, printing every 10 iterations.
        train_one_epoch(ddp_model,
                        optimizer,
                        train_load,
                        args.local_rank,
                        epoch,
                        print_freq=10)
        # update the learning rate
        lr_scheduler.step()
        # FIX: pass the rank itself (as train_one_epoch above does), not a
        # one-element list — tensors cannot be moved with .to([rank]).
        evaluate(ddp_model, test_load, device=args.local_rank)

    # Save the model. NOTE(review): this saves the DDP wrapper's state_dict,
    # whose keys carry a 'module.' prefix — confirm downstream loaders
    # expect that (ddp_model.module.state_dict() would avoid the prefix).
    torch.save(ddp_model.state_dict(), 'model.pth')
    # FIX: corrected the misspelled checkpoint key 'opitimizer_state_dict'.
    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': ddp_model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }, 'ckpt.pth')
def main(network):
    """Train a detection network on the TB dataset and record its losses.

    Parameters
    ----------
    network : str
        One of ``'resnet50'``, ``'resnet18'``, ``'resnet152'``,
        ``'RPNresnet50'``, ``'RPNresnet152'`` selecting the backbone and
        the model class (FasterRCNN vs RPN_custom).

    Returns
    -------
    tuple
        ``(Ls, Ls_val, num_epochs)`` — training-loss history,
        validation-loss history, and the number of epochs trained.

    Raises
    ------
    ValueError
        If ``network`` is not one of the supported names.
    """
    # Train on the GPU, or on the CPU if a GPU is not available.
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # Our dataset has two classes only - background and person.
    num_classes = 2

    # Wrap the module-level datasets in shuffled Subsets.
    indices = torch.randperm(len(TBdata)).tolist()
    dataset = torch.utils.data.Subset(TBdata, indices[:])
    indices_ = torch.randperm(len(TBdata_test)).tolist()
    dataset_val = torch.utils.data.Subset(TBdata_test, indices_[:])

    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=8,
                                             sampler=None,
                                             num_workers=0,
                                             collate_fn=collate_fn)
    dataloader_val = torch.utils.data.DataLoader(dataset_val,
                                                 batch_size=8,
                                                 sampler=None,
                                                 num_workers=0,
                                                 collate_fn=collate_fn)

    # Calculated statistics on training data: transform parameters.
    min_size = 550
    max_size = 700
    image_means = [0.9492, 0.9492, 0.9492]
    image_stds = [0.1158, 0.1158, 0.1158]

    # Map each supported network name to (backbone name, model class).
    # FIX: replaces the previous if/elif chain, which silently left `model`
    # undefined for unknown names and raised a confusing NameError later.
    architectures = {
        'resnet50': ('resnet50', FasterRCNN),
        'resnet18': ('resnet18', FasterRCNN),
        'resnet152': ('resnet152', FasterRCNN),
        'RPNresnet50': ('resnet50', RPN_custom),
        'RPNresnet152': ('resnet152', RPN_custom),
    }
    try:
        backbone_name, model_cls = architectures[network]
    except KeyError:
        raise ValueError(f"Unknown network '{network}'; expected one of "
                         f"{sorted(architectures)}")
    backbone = resnet_fpn_backbone(backbone_name, True)
    model = model_cls(backbone,
                      num_classes,
                      min_size=min_size,
                      max_size=max_size,
                      image_mean=image_means,
                      image_std=image_stds)

    # Move model to the right device.
    model.to(device)

    # Construct an optimizer over the trainable parameters only.
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9,
                                weight_decay=0.0005)
    # Decay the learning rate by 10x every 3 epochs.
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)

    num_epochs = 10
    # Running loss histories for training and validation.
    loss_keys = [
        'total loss', 'loss_classifier', 'loss_box_reg', 'loss_objectness',
        'loss_rpn_box_reg'
    ]
    Ls = {k: [] for k in loss_keys}
    Ls_val = {k: [] for k in loss_keys}
    for epoch in range(num_epochs):
        # Train for one epoch, printing every 10 iterations.
        train_one_epoch(model, optimizer, dataloader, device, epoch,
                        print_freq=10)
        # Update the learning rate.
        lr_scheduler.step()
        # Record losses on validation and training data.
        Ls_val = record_losses(model, dataloader_val, device, Ls_val, network)
        Ls = record_losses(model, dataloader, device, Ls, network)

    # If the output folder does not exist already, create it.
    output_loc = f'./{network}/'
    if not os.path.exists(output_loc):
        os.makedirs(output_loc)
    torch.save(model.state_dict(), output_loc + 'model.pt')

    print("That's it!")
    return Ls, Ls_val, num_epochs
def main(args):
    """Detection trainer (older torchvision ``pretrained=`` API): builds
    loaders, model, SGD + MultiStepLR, optionally resumes, then trains with
    per-epoch checkpointing and evaluation."""
    if args.output_dir:
        utils.mkdir(args.output_dir)

    utils.init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)

    # Data loading code
    print("Loading data")

    dataset, num_classes = get_dataset(
        args.dataset, "train", get_transform(True, args.data_augmentation),
        args.data_path)
    dataset_test, _ = get_dataset(args.dataset, "val",
                                  get_transform(False, args.data_augmentation),
                                  args.data_path)

    print("Creating data loaders")
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(
            dataset_test)
    else:
        train_sampler = torch.utils.data.RandomSampler(dataset)
        test_sampler = torch.utils.data.SequentialSampler(dataset_test)

    if args.aspect_ratio_group_factor >= 0:
        # Batch images of similar aspect ratio together to reduce padding.
        group_ids = create_aspect_ratio_groups(
            dataset, k=args.aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids,
                                                  args.batch_size)
    else:
        train_batch_sampler = torch.utils.data.BatchSampler(train_sampler,
                                                            args.batch_size,
                                                            drop_last=True)

    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_sampler=train_batch_sampler,
        num_workers=args.workers,
        collate_fn=utils.collate_fn)
    data_loader_test = torch.utils.data.DataLoader(dataset_test,
                                                   batch_size=1,
                                                   sampler=test_sampler,
                                                   num_workers=args.workers,
                                                   collate_fn=utils.collate_fn)

    print("Creating model")
    kwargs = {"trainable_backbone_layers": args.trainable_backbone_layers}
    if "rcnn" in args.model:
        if args.rpn_score_thresh is not None:
            kwargs["rpn_score_thresh"] = args.rpn_score_thresh
    model = torchvision.models.detection.__dict__[args.model](
        num_classes=num_classes, pretrained=args.pretrained, **kwargs)
    model.to(device)

    # Unwrapped handle used for checkpoint save/load under DDP.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)

    # Resume model/optimizer/scheduler state and the epoch counter.
    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1

    if args.test_only:
        evaluate(model, data_loader_test, device=device)
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            # Reshuffle each rank's shard for this epoch.
            train_sampler.set_epoch(epoch)
        train_one_epoch(model, optimizer, data_loader, device, epoch,
                        args.print_freq)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint = {
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'args': args,
                'epoch': epoch
            }
            utils.save_on_master(
                checkpoint,
                os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))
            utils.save_on_master(
                checkpoint, os.path.join(args.output_dir, 'checkpoint.pth'))

        # evaluate after every epoch
        evaluate(model, data_loader_test, device=device)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    """DETR-style MOT training entry point.

    Builds the tracking dataset and DETR-style augmentation pipeline, the
    model/criterion/postprocessors, three learning-rate parameter groups
    (main / backbone / linear projections), then runs the training loop with
    periodic checkpointing. Supports resuming from a checkpoint whose
    class-embedding heads are filtered out (for a different class count).
    """
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility (rank-offset per distributed worker)
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # Resolve the dataset class and read the data config (train list + root).
    Dataset = get_dataset(args.dataset, args.task)
    f = open(args.data_cfg)
    data_config = json.load(f)
    trainset_paths = data_config['train']
    dataset_root = data_config['root']
    f.close()

    # ImageNet normalization, applied after the geometric augmentations.
    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
    # DETR-style augmentation: random flip, then either a multi-scale resize
    # or a resize -> random crop -> resize pipeline.
    transforms = T.Compose([
        T.RandomHorizontalFlip(),
        T.RandomSelect(
            T.RandomResize(scales, max_size=1333),
            T.Compose([
                T.RandomResize([400, 500, 600]),
                T.RandomSizeCrop(384, 600),
                # T.RandomSizeCrop_MOT(384, 600),
                T.RandomResize(scales, max_size=1333),
            ])),
        normalize,
    ])
    dataset_train = Dataset(args,
                            dataset_root,
                            trainset_paths, (1088, 608),
                            augment=True,
                            transforms=transforms)
    # Number of identities for the re-ID head, taken from the dataset.
    args.nID = dataset_train.nID

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model  # unwrapped handle used for checkpointing

    # dataset_train = build_dataset(image_set='train', args=args)
    # dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        if args.cache_mode:
            sampler_train = samplers.NodeDistributedSampler(dataset_train)
            # sampler_val = samplers.NodeDistributedSampler(dataset_val, shuffle=False)
        else:
            sampler_train = samplers.DistributedSampler(dataset_train)
            # sampler_val = samplers.DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        # sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers,
                                   pin_memory=True)
    # data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
    #                              drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers,
    #                              pin_memory=True)

    # data_loader_train = torch.utils.data.DataLoader(
    #     dataset_train,
    #     batch_size=args.batch_size,
    #     shuffle=True,
    #     num_workers=args.num_workers,
    #     pin_memory=True,
    #     drop_last=True
    # )

    # lr_backbone_names = ["backbone.0", "backbone.neck", "input_proj", "transformer.encoder"]
    def match_name_keywords(n, name_keywords):
        # True when any keyword occurs in the parameter name `n`.
        out = False
        for b in name_keywords:
            if b in n:
                out = True
                break
        return out

    for n, p in model_without_ddp.named_parameters():
        print(n)

    # Used to freeze the classifier parameters (kept for reference):
    # for name,p in model_without_ddp.named_parameters():
    #     if name.startswith('classifier'):
    #         p.requires_grad = False

    # Three LR groups: main params at args.lr, backbone params at
    # args.lr_backbone, and linear-projection params at a scaled LR.
    param_dicts = [{
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if not match_name_keywords(n, args.lr_backbone_names)
            and not match_name_keywords(n, args.lr_linear_proj_names)
            and p.requires_grad
        ],
        "lr": args.lr,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if match_name_keywords(n, args.lr_backbone_names)
            and p.requires_grad
        ],
        "lr": args.lr_backbone,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if match_name_keywords(n, args.lr_linear_proj_names)
            and p.requires_grad
        ],
        "lr": args.lr * args.lr_linear_proj_mult,
    }]
    if args.sgd:
        optimizer = torch.optim.SGD(param_dicts,
                                    lr=args.lr,
                                    momentum=0.9,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.AdamW(param_dicts,
                                      lr=args.lr,
                                      weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)
    # optimizer.add_param_group({'params': criterion.parameters()})

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_dict = model_without_ddp.state_dict()  # current model parameters
        # Drop the class-embedding heads from the pretrained weights so a
        # model with a different number of classes can still be initialized.
        pretrained_dict = {
            k: v
            for k, v in checkpoint['model'].items() if k not in [
                "class_embed.0.weight", "class_embed.0.bias",
                "class_embed.1.weight", "class_embed.1.bias",
                "class_embed.2.weight", "class_embed.2.bias",
                "class_embed.3.weight", "class_embed.3.bias",
                "class_embed.4.weight", "class_embed.4.bias",
                "class_embed.5.weight", "class_embed.5.bias"
            ]
        }
        model_dict.update(pretrained_dict)
        # missing_keys, unexpected_keys = model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            model_dict, strict=False)
        # Profiler bookkeeping buffers are not real weights; ignore them.
        unexpected_keys = [
            k for k in unexpected_keys
            if not (k.endswith('total_params') or k.endswith('total_ops'))
        ]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))
        # Only the epoch counter is restored; the optimizer/scheduler resume
        # below is deliberately disabled (kept commented for reference).
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            args.start_epoch = checkpoint['epoch'] + 1
            # optimizer.load_state_dict(checkpoint['optimizer'])
        # if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
        #     import copy
        #     p_groups = copy.deepcopy(optimizer.param_groups)
        #     # optimizer.load_state_dict(checkpoint['optimizer'])
        #     for pg, pg_old in zip(optimizer.param_groups, p_groups):
        #         pg['lr'] = pg_old['lr']
        #         pg['initial_lr'] = pg_old['initial_lr']
        #     # print(optimizer.param_groups)
        #     lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        #     # todo: this is a hack for doing experiment that resume from checkpoint and also modify lr scheduler (e.g., decrease lr in advance).
        #     args.override_resumed_lr_drop = True
        #     if args.override_resumed_lr_drop:
        #         print('Warning: (hack) args.override_resumed_lr_drop is set to True, so args.lr_drop would override lr_drop in resumed lr_scheduler.')
        #         lr_scheduler.step_size = args.lr_drop
        #         lr_scheduler.base_lrs = list(map(lambda group: group['initial_lr'], optimizer.param_groups))
        #     lr_scheduler.step(lr_scheduler.last_epoch)

    # model.add_module('id')
    # [p for p in model.named_parameters() if not p[1].requires_grad]
    # Used to freeze the classifier parameters (kept for reference):
    # optimizer = torch.optim.SGD(filter(lambda x: "classifier" not in x[0], model.parameters()), lr=args.lr,
    #                             momentum=0.9, weight_decay=1e-4)
    # model.classifier.training = False

    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            # Reshuffle each rank's shard for this epoch.
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(args, model, criterion,
                                      data_loader_train, optimizer, device,
                                      epoch, args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 5 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 5 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    """Train (or, with --test_only, evaluate) a YOLOv5s detector.

    Builds (optionally distributed) train/val data loaders, the model,
    an SGD optimizer with a cosine or multi-step LR schedule, then runs
    the epoch loop, checkpointing to ``args.output_dir`` every epoch.
    """
    utils.init_distributed_mode(args)
    print(args)
    device = torch.device(args.device)

    # Data loading code
    print('Loading data')
    dataset_train = build_dataset(args.train_set, args.dataset_year, args)
    dataset_val = build_dataset(args.val_set, args.dataset_year, args)
    # COCO API handle over the validation set, consumed by evaluate().
    base_ds = get_coco_api_from_dataset(dataset_val)

    print('Creating data loaders')
    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    # Training drops the last partial batch; validation keeps every sample.
    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True,
    )
    data_loader_train = DataLoader(
        dataset_train,
        batch_sampler=batch_sampler_train,
        collate_fn=utils.collate_fn,
        num_workers=args.num_workers,
    )
    data_loader_val = DataLoader(
        dataset_val,
        args.batch_size,
        sampler=sampler_val,
        drop_last=False,
        collate_fn=utils.collate_fn,
        num_workers=args.num_workers,
    )

    print('Creating model, always set args.return_criterion be True')
    # Presumably makes the model return its training criterion alongside
    # predictions — TODO confirm against yolov5s implementation.
    args.return_criterion = True
    model = yolov5s(num_classes=args.num_classes)
    model.to(device)

    # Keep a handle on the raw module for checkpointing / resuming even
    # when `model` is wrapped in DDP below.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.gpu],
        )
        model_without_ddp = model.module

    # Optimize only parameters that require gradients.
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(
        params,
        lr=args.lr,
        momentum=args.momentum,
        weight_decay=args.weight_decay,
    )
    if args.lr_scheduler == 'cosine':
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, args.t_max)
    elif args.lr_scheduler == 'multi-step':
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=args.lr_steps,
            gamma=args.lr_gamma,
        )
    else:
        raise ValueError(f'scheduler {args.lr_scheduler} not supported')

    output_dir = Path(args.output_dir)
    # Resume model / optimizer / scheduler state and continue from the
    # epoch after the one stored in the checkpoint.
    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1

    if args.test_only:
        evaluate(model, data_loader_val, base_ds, device)
        return

    print('Start training')
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            # Give each epoch a different shuffle across workers.
            sampler_train.set_epoch(epoch)
        train_one_epoch(model, optimizer, data_loader_train, device, epoch,
                        args.print_freq)
        lr_scheduler.step()
        if args.output_dir:
            # save_on_master writes only on rank 0.
            utils.save_on_master(
                {
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'args': args,
                    'epoch': epoch,
                },
                output_dir.joinpath(f'model_{epoch}.pth'),
            )
        # evaluate after every epoch
        # evaluate(model, criterion, data_loader_val, device=device)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print(f'Training time {total_time_str}')
weight_decay=0.0005) # and a learning rate scheduler which decreases the learning rate by # 10x every 3 epochs lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1) # let's train it for 10 epochs num_epochs = 5 for epoch in range(num_epochs): # train for one epoch, printing every 10 iterations train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10) # update the learning rate lr_scheduler.step() # evaluate on the test dataset evaluate(model, data_loader_test, device=device) torch.save(model.state_dict(), "pretrained_true11.pth") checkpoint = torch.load("pretrained_true11.pth") model.load_state_dict(checkpoint) import matplotlib import matplotlib.pyplot as plt
def main(args):
    """Train / evaluate DETR (detection or panoptic segmentation).

    Fixes relative to the previous revision:
    * the DistributedDataParallel wrapper was assigned to a throwaway
      ``tmp_model`` variable, so distributed runs trained the *unwrapped*
      model (no gradient synchronization across ranks); the wrapper is
      now assigned back to ``model`` as intended;
    * ``n_parameters`` was computed only inside the distributed branch,
      raising a NameError later in ``log_stats`` for single-GPU runs; it
      is now computed unconditionally.
    """
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility (offset by rank so workers differ)
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    # Keep the raw module for checkpointing / state_dict access.
    model_without_ddp = model
    if args.distributed:
        # FIX: wrapper was previously stored in `tmp_model` and never used.
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    # FIX: previously only computed in the distributed branch.
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    # Backbone parameters get their own (usually smaller) learning rate.
    param_dicts = [
        {"params": [p for n, p in model_without_ddp.named_parameters()
                    if "backbone" not in n and p.requires_grad]},
        {
            "params": [p for n, p in model_without_ddp.named_parameters()
                       if "backbone" in n and p.requires_grad],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    # SG: Making mixed precision a command line optional step
    # NOTE(review): with both --distributed and --mixed_precision the model
    # would now be wrapped by torch DDP and then apex DDP — confirm intended.
    if args.mixed_precision:
        print("Mixed Precision Training Selected.")
        model, optimizer = amp.initialize(model, optimizer, opt_level="O0")
        model = apex.parallel.DistributedDataParallel(model)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val, args.batch_size,
                                 sampler=sampler_val, drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        # Restore optimizer / scheduler / epoch only when training on.
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    if args.eval:
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(
            model, criterion, data_loader_train, optimizer, device, epoch,
            args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        test_stats, coco_evaluator = evaluate(
            model, criterion, postprocessors, data_loader_val, base_ds, device,
            args.output_dir
        )

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     **{f'test_{k}': v for k, v in test_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main():
    """Train and evaluate Mask R-CNN instance segmentation on PennFudanPed."""
    # Pick the GPU when available, otherwise fall back to CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Two classes: background + person.
    num_classes = 2

    # Two dataset views over the same images: one with training-time
    # augmentation, one with evaluation transforms.
    full_train = PennFudanDataset('PennFudanPed', get_transform(train=True))
    full_eval = PennFudanDataset('PennFudanPed', get_transform(train=False))

    # Random split: last 50 shuffled samples are held out for evaluation.
    shuffled = torch.randperm(len(full_train)).tolist()
    train_subset = torch.utils.data.Subset(full_train, shuffled[:-50])
    test_subset = torch.utils.data.Subset(full_eval, shuffled[-50:])

    train_loader = torch.utils.data.DataLoader(
        train_subset, batch_size=2, shuffle=True, num_workers=4,
        collate_fn=utils.collate_fn)
    test_loader = torch.utils.data.DataLoader(
        test_subset, batch_size=1, shuffle=False, num_workers=4,
        collate_fn=utils.collate_fn)

    model = get_model_instance_segmentation(num_classes)
    model.to(device)

    # Optimize only the trainable parameters.
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(trainable, lr=0.005, momentum=0.9,
                                weight_decay=0.0005)
    # Decay the learning rate 10x every 3 epochs.
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3,
                                                   gamma=0.1)

    num_epochs = 10
    for epoch in range(num_epochs):
        # Train (logging every 10 iterations), step the LR, then evaluate.
        train_one_epoch(model, optimizer, train_loader, device, epoch,
                        print_freq=10)
        lr_scheduler.step()
        evaluate(model, test_loader, device=device)

    print("That's it!")
def main(config):
    """Train an image-captioning model described by ``config``.

    Loads warm-start weights, optionally resumes from the epoch-19
    checkpoint, then runs the train/validate loop, saving a numbered
    checkpoint after every epoch.
    """
    device = torch.device(config.device)
    print(f'Initializing Device: {device}')

    # Seed per rank so distributed workers get different streams.
    seed = config.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)

    model, criterion = caption.build_model(config)
    # Warm-start from exported weights before any checkpoint logic runs.
    model.load_state_dict(torch.load("pretrained_wts/my_model.pth"))
    model.to(device)
    print("Model Loaded")

    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print(f"Number of params: {n_parameters}")

    # Backbone parameters get their own (usually smaller) learning rate.
    param_dicts = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr": config.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=config.lr,
                                  weight_decay=config.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, config.lr_drop)

    dataset_train = coco.build_dataset(config, mode='training')
    dataset_val = coco.build_dataset(config, mode='validation')
    print(f"Train: {len(dataset_train)}")
    print(f"Valid: {len(dataset_val)}")

    sampler_train = torch.utils.data.RandomSampler(dataset_train)
    sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    # Training drops the last partial batch; validation keeps every sample.
    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        config.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   num_workers=config.num_workers)
    data_loader_val = DataLoader(dataset_val, config.batch_size,
                                 sampler=sampler_val, drop_last=False,
                                 num_workers=config.num_workers)

    # Resume only from the hard-coded epoch-19 checkpoint when present
    # (checkpoint files are named <config.checkpoint><epoch_number>).
    if os.path.exists(config.checkpoint + "19"):
        print("Loading Checkpoint...19")
        checkpoint = torch.load(config.checkpoint + "19", map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        config.start_epoch = checkpoint['epoch'] + 1
        print("Loaded Checkpoint:", config.checkpoint + "19")

    print("Start Training..")
    # epoch starts from 0
    for epoch in range(config.start_epoch, config.epochs):
        print(f"Epoch: {epoch}")
        epoch_loss = train_one_epoch(model, criterion, data_loader_train,
                                     optimizer, device, epoch,
                                     config.clip_max_norm)
        lr_scheduler.step()
        print(f"Training Loss: {epoch_loss}")

        torch.save(
            {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'epoch': epoch,  # epoch = 0 in checkpoint means epoch num 1
            },
            config.checkpoint + str(epoch + 1)
        )  # saved checkpoint checkpoint.pth1 means chkpt for epoch num 1

        validation_loss = evaluate(model, criterion, data_loader_val, device)
        print(f"Validation Loss: {validation_loss}")

        print()
def main():
    """Train a torchvision detection model (CLI entry point).

    Builds (optionally distributed / data-parallel) loaders and model,
    then runs the train/evaluate loop, checkpointing every epoch.

    Fixes relative to the previous revision (aligned with the sibling
    training scripts in this file):
    * checkpoints now record ``'epoch'`` and ``--resume`` continues from
      the following epoch instead of silently restarting at epoch 0;
    * the distributed train sampler gets ``set_epoch`` so shuffling
      differs across epochs.
    """
    args = get_args()

    if args.output_dir:
        utils.mkdir(args.output_dir)

    utils.init_distributed_mode(args)

    # Data loading
    print("Loading data")
    dataset, num_classes = get_dataset(args.dataset, "train",
                                       get_transform(train=True))
    dataset_test, _ = get_dataset(args.dataset, "val",
                                  get_transform(train=False))

    print("Creating data loaders")
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(
            dataset_test)
    else:
        train_sampler = torch.utils.data.RandomSampler(dataset)
        test_sampler = torch.utils.data.SequentialSampler(dataset_test)

    # Optionally batch together images of similar aspect ratio.
    if args.aspect_ratio_group_factor >= 0:
        group_ids = create_aspect_ratio_groups(
            dataset, k=args.aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids,
                                                  args.b)
    else:
        train_batch_sampler = torch.utils.data.BatchSampler(
            train_sampler, args.b, drop_last=True)

    data_loader = torch.utils.data.DataLoader(
        dataset, batch_sampler=train_batch_sampler, num_workers=args.workers,
        collate_fn=utils.collate_fn)
    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=args.b, sampler=test_sampler,
        num_workers=args.workers, collate_fn=utils.collate_fn)

    # Model creating
    print("Creating model")
    # model = models.__dict__[args.model](num_classes=num_classes, pretrained=args.pretrained)
    model = torchvision.models.detection.__dict__[args.model](
        num_classes=num_classes, pretrained=args.pretrained)
    device = torch.device(args.device)
    model.to(device)

    # Distribute: keep the raw module for checkpointing.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    # Parallel
    if args.parallel:
        print('Training parallel')
        model = torch.nn.DataParallel(model, device_ids=[args.gpu]).cuda()
        model_without_ddp = model.module

    # Optimizer: only parameters that require gradients.
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=args.lr, momentum=args.momentum,
                                weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)

    # Resume training
    start_epoch = 0
    if args.resume:
        print('Resume training')
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        # Older checkpoints have no 'epoch' entry; fall back to epoch 0.
        start_epoch = checkpoint.get('epoch', -1) + 1

    if args.test_only:
        evaluate(model, data_loader_test, device=device)
        return

    # Training
    print('Start training')
    start_time = time.time()
    for epoch in range(start_epoch, args.epochs):
        if args.distributed:
            # FIX: reshuffle differently each epoch across workers.
            train_sampler.set_epoch(epoch)
        train_one_epoch(model, optimizer, data_loader, device, epoch,
                        args.print_freq)
        lr_scheduler.step()
        if args.output_dir:
            utils.save_on_master(
                {
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,  # FIX: recorded so --resume can continue
                    'args': args
                },
                os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))
        # evaluate after every epoch
        evaluate(model, data_loader_test, device=device)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main():
    """Train Mask R-CNN instance segmentation on the cow dataset (80/20 split)."""
    # Prefer the first GPU; fall back to CPU if CUDA is unavailable.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Background + cow.
    num_classes = 2

    full_dataset = CowDataset(get_transform(train=True))

    # Shuffle once; first 80% of indices train, the remainder evaluate.
    order = torch.randperm(len(full_dataset)).tolist()
    cut = int(len(full_dataset) * 0.8)
    train_split = torch.utils.data.Subset(full_dataset, order[:cut])
    eval_split = torch.utils.data.Subset(full_dataset, order[cut:])

    train_loader = torch.utils.data.DataLoader(
        train_split, batch_size=6, shuffle=True, num_workers=4,
        collate_fn=utils.collate_fn)
    eval_loader = torch.utils.data.DataLoader(
        eval_split, batch_size=1, shuffle=False, num_workers=4,
        collate_fn=utils.collate_fn)

    model = get_model_instance_segmentation(num_classes)
    model.to(device)

    # Optimize only the trainable parameters.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(trainable_params, lr=0.005, momentum=0.9,
                                weight_decay=0.0005)
    # Shrink the learning rate 10x every 3 epochs.
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3,
                                                   gamma=0.1)

    for epoch in range(20):
        # Train (logging every 5 iterations), step the LR, evaluate,
        # then snapshot the weights for this epoch.
        train_one_epoch(model, optimizer, train_loader, device, epoch,
                        print_freq=5)
        lr_scheduler.step()
        evaluate(model, eval_loader, device=device)
        torch.save(model.state_dict(), 'mask_rcnn_model_%d.pth' % epoch)

    print("That's it!")
def main(args):
    """Train / evaluate a Deformable-DETR-style model (train/val/test splits).

    Fixes relative to the previous revision:
    * ``data_loader_test`` was built with ``sampler_val`` and therefore
      iterated the *validation* indices over the test dataset; it now uses
      its own ``sampler_test``;
    * the distributed ``cache_mode`` branch never created a test sampler;
      one is now built there too;
    * the 28 hand-written ``del checkpoint[...]`` statements are replaced
      by a loop using ``dict.pop`` so absent keys no longer raise KeyError.
    """
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility (offset by rank so workers differ)
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)
    dataset_test = build_dataset(image_set='test', args=args)

    if args.distributed:
        if args.cache_mode:
            sampler_train = samplers.NodeDistributedSampler(dataset_train)
            sampler_val = samplers.NodeDistributedSampler(dataset_val,
                                                          shuffle=False)
            # FIX: no test sampler was created on this path before.
            sampler_test = samplers.NodeDistributedSampler(dataset_test,
                                                           shuffle=False)
        else:
            sampler_train = samplers.DistributedSampler(dataset_train)
            sampler_val = samplers.DistributedSampler(dataset_val,
                                                      shuffle=False)
            sampler_test = samplers.DistributedSampler(dataset_test,
                                                       shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)
        sampler_test = torch.utils.data.SequentialSampler(dataset_test)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers,
                                   pin_memory=True)
    data_loader_val = DataLoader(dataset_val, args.batch_size,
                                 sampler=sampler_val, drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers,
                                 pin_memory=True)
    # FIX: previously passed sampler_val here.
    data_loader_test = DataLoader(dataset_test, args.batch_size,
                                  sampler=sampler_test, drop_last=False,
                                  collate_fn=utils.collate_fn,
                                  num_workers=args.num_workers,
                                  pin_memory=True)

    # lr_backbone_names = ["backbone.0", "backbone.neck", "input_proj", "transformer.encoder"]
    def match_name_keywords(n, name_keywords):
        # True when any keyword is a substring of parameter name `n`.
        return any(b in n for b in name_keywords)

    # Debug aid: list every parameter name once at startup.
    for n, p in model_without_ddp.named_parameters():
        print(n)

    # Three LR groups: main body, backbone, and linear-projection layers.
    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if not match_name_keywords(n, args.lr_backbone_names)
                and not match_name_keywords(n, args.lr_linear_proj_names)
                and p.requires_grad
            ],
            "lr": args.lr,
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if match_name_keywords(n, args.lr_backbone_names)
                and p.requires_grad
            ],
            "lr": args.lr_backbone,
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if match_name_keywords(n, args.lr_linear_proj_names)
                and p.requires_grad
            ],
            "lr": args.lr * args.lr_linear_proj_mult,
        },
    ]
    if args.sgd:
        optimizer = torch.optim.SGD(param_dicts, lr=args.lr, momentum=0.9,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                      weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        # Drop every classification head from the checkpoint so a model with
        # a different number of classes can load the remaining weights.
        for i in range(7):
            for suffix in ('weight', 'bias'):
                checkpoint['model'].pop(
                    f'transformer.decoder.class_embed.{i}.{suffix}', None)
                checkpoint['model'].pop(f'class_embed.{i}.{suffix}', None)
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
        unexpected_keys = [
            k for k in unexpected_keys
            if not (k.endswith('total_params') or k.endswith('total_ops'))
        ]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))
        # NOTE: optimizer / lr_scheduler / start_epoch restoration is
        # intentionally disabled here — resuming reloads weights only and
        # restarts the optimization state from scratch.
        # check the resumed model
        if not args.eval:
            test_stats, coco_evaluator = evaluate(model, criterion,
                                                  postprocessors,
                                                  data_loader_val, base_ds,
                                                  device, args.output_dir)

    if args.eval:
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    if args.test:
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_test, base_ds,
                                              device, args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 5 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 5 == 0:
                checkpoint_paths.append(
                    output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)

        log_stats = {
            **{f'train_{k}': v for k, v in train_stats.items()},
            **{f'test_{k}': v for k, v in test_stats.items()},
            'epoch': epoch,
            'n_parameters': n_parameters
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    """Train a torchvision detection model on a custom 2-class (car vs.
    background) dataset with an 80/20 random train/validation split."""
    utils.init_distributed_mode(args)
    print(args)

    # device = torch.device(args.device)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print("Loading data")
    # dataset, num_classes = get_dataset(args.dataset, "train", get_transform(train=True), args.data_path)
    # dataset_test, _ = get_dataset(args.dataset, "val", get_transform(train=False), args.data_path)
    dataset = CustomDataset(args.img_data_path, args.anno_data_path,
                            transforms=get_transform(train=True))
    # 80/20 split over shuffled indices; fixed seed keeps the split stable
    # across runs.
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(.2 * dataset_size))
    random_seed = 42
    np.random.seed(random_seed)
    np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]
    train_sampler = SubsetRandomSampler(train_indices)
    test_sampler = SubsetRandomSampler(val_indices)

    print("Creating data loaders")
    # if args.distributed:
    #     train_sampler = torch.utils.data.distributed.DistributedSampler(dataset_train)
    #     test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test)
    # else:
    #     train_sampler = torch.utils.data.RandomSampler(dataset_train)
    #     test_sampler = torch.utils.data.SequentialSampler(dataset_test)
    #
    # if args.aspect_ratio_group_factor >= 0:
    #     group_ids = create_aspect_ratio_groups(dataset_train, k=args.aspect_ratio_group_factor)
    #     train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size)
    # else:
    #     train_batch_sampler = torch.utils.data.BatchSampler(
    #         train_sampler, args.batch_size, drop_last=True)
    data_loader = torch.utils.data.DataLoader(dataset, batch_size=2,
                                              sampler=train_sampler,
                                              num_workers=args.workers,
                                              collate_fn=utils.collate_fn)
    # NOTE(review): validation shares the same dataset object, so val batches
    # also go through the train=True transforms, and SubsetRandomSampler
    # shuffles them — confirm this is intended.
    data_loader_test = torch.utils.data.DataLoader(dataset, batch_size=1,
                                                   sampler=test_sampler,
                                                   num_workers=args.workers,
                                                   collate_fn=utils.collate_fn)

    print("Creating model")
    num_classes = 2  # 1 class (Car) + background
    model = torchvision.models.detection.__dict__[args.model](
        num_classes=num_classes, pretrained=args.pretrained)
    model.to(device)
    # Raw module handle for checkpointing (DDP path is disabled below).
    model_without_ddp = model
    # if args.distributed:
    #     model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
    #     model_without_ddp = model.module

    # Optimize only parameters that require gradients.
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=args.lr, momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)

    # Resume model / optimizer / scheduler state and continue from the
    # epoch after the one stored in the checkpoint.
    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1

    if args.test_only:
        evaluate(model, data_loader_test, device=device)
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        # if args.distributed:
        #     train_sampler.set_epoch(epoch)
        train_one_epoch(model, optimizer, data_loader, device, epoch,
                        args.print_freq)
        lr_scheduler.step()
        if args.output_dir:
            # save_on_master writes only on the main process.
            utils.save_on_master(
                {
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'args': args,
                    'epoch': epoch
                },
                os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))
        # evaluate after every epoch
        evaluate(model, data_loader_test, device=device)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    """Train a DETR-style detection model.

    Builds the model and criterion, optionally wraps the model in DDP,
    warm-starts from a COCO-pretrained DETR checkpoint (resnet50/resnet101
    backbones only), optionally resumes a full training state, then trains
    for ``args.epochs`` epochs with AdamW and a StepLR drop at
    ``args.lr_drop``, checkpointing and appending JSON log lines per epoch.

    Args:
        args: parsed command-line namespace (model/backbone choice, LRs,
            dataset settings, distributed flags, resume path, output dir).
    """
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))
    print(args)

    device = torch.device(args.device)

    # Fix the seed for reproducibility; offset by rank so workers differ.
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module

    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    # Backbone parameters get their own (typically smaller) learning rate.
    param_dicts = [
        {"params": [p for n, p in model_without_ddp.named_parameters()
                    if "backbone" not in n and p.requires_grad]},
        {
            "params": [p for n, p in model_without_ddp.named_parameters()
                       if "backbone" in n and p.requires_grad],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)
    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)

    # Warm-start from a pretrained COCO DETR checkpoint.  Only keys that
    # exist in our model are copied; the 'swin' backbone has no pretrained
    # DETR weights available here.
    assert args.num_queries == 100, args.num_queries
    assert args.enc_layers == 6 and args.dec_layers == 6
    assert args.backbone in ['resnet50', 'resnet101', 'swin'], args.backbone
    if args.backbone == 'resnet50':
        pretrain_model = './data/detr_coco/detr-r50-e632da11.pth'
    elif args.backbone == 'resnet101':
        pretrain_model = './data/detr_coco/detr-r101-2c7b67e5.pth'
    else:
        pretrain_model = None
    if pretrain_model is not None:
        pretrain_dict = torch.load(pretrain_model,
                                   map_location='cpu')['model']
        my_model_dict = model_without_ddp.state_dict()
        pretrain_dict = {k: v for k, v in pretrain_dict.items()
                         if k in my_model_dict}
        my_model_dict.update(pretrain_dict)
        model_without_ddp.load_state_dict(my_model_dict)

    output_dir = Path(args.output_dir)
    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        # Only restore the full training state when the checkpoint has it.
        if ('optimizer' in checkpoint and 'lr_scheduler' in checkpoint
                and 'epoch' in checkpoint):
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(
            model, criterion, data_loader_train, optimizer, device, epoch,
            args.clip_max_norm)
        lr_scheduler.step()

        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # Extra named checkpoint at the LR drop, every 100 epochs, and
            # every 10 epochs after the drop.
            # FIX: the previous two separate `if`s could append the SAME
            # path twice (e.g. lr_drop=10 at epoch 19), serializing and
            # writing the identical checkpoint twice; one merged condition
            # appends each path at most once.
            if ((epoch + 1) % args.lr_drop == 0
                    or (epoch + 1) % 100 == 0
                    or ((epoch + 1) > args.lr_drop
                        and (epoch + 1) % 10 == 0)):
                checkpoint_paths.append(
                    output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}
        # Only rank 0 appends to the shared log file.
        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main():
    """Train a Mask R-CNN instance-segmentation model.

    Selects the Malaria or PennFudan dataset from ``args.dataset``, trains
    for ``hparams.num_epochs`` epochs with SGD + StepLR, evaluates and
    checkpoints after every epoch, and logs training to TensorBoard.
    """
    global hparams, args

    # Train on the GPU if available, otherwise fall back to the CPU.
    device = (torch.device(hparams.device) if torch.cuda.is_available()
              else torch.device('cpu'))

    if (args.dataset == 'malaria'):
        hparams.dataset_root = 'malaria'
        hparams.exp_name = f'maskrcnn-{hparams.dataset_root}'
        dataset = MalariaDataset(hparams.train_dir, hparams.train_csv,
                                 get_transform(train=True))
        dataset_test = MalariaDataset(hparams.test_dir, hparams.test_csv,
                                      get_transform(False))
    else:
        dataset = PennFudanDataset('PennFudanPed',
                                   get_transform(train=True))
        dataset_test = PennFudanDataset('PennFudanPed',
                                        get_transform(train=False))
    # NOTE(review): set unconditionally here — looks correct for PennFudan
    # (background + person); confirm the malaria dataset is also binary.
    hparams.num_classes = 2

    writer = SummaryWriter(f'runs/{hparams.exp_name}_{hparams.timestamp}')

    # Training and validation data loaders.
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=2, shuffle=True, num_workers=4,
        collate_fn=utils.collate_fn)
    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=1, shuffle=False, num_workers=4,
        collate_fn=utils.collate_fn)

    # Build the model and move it to the selected device.
    model = get_model_instance_segmentation(hparams)
    model.to(device)

    model_without_ddp = model  # handle used when serializing state dicts
    if hparams.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[hparams.device_ids])
        model_without_ddp = model.module

    # Optimize only parameters that require gradients.
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9,
                                weight_decay=0.0005)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3,
                                                   gamma=0.1)

    num_epochs = hparams.num_epochs
    for epoch in range(num_epochs):
        # Train for one epoch, printing every 10 iterations.
        train_one_epoch(model, optimizer, data_loader, device, epoch,
                        print_freq=10, writer=writer)
        lr_scheduler.step()
        # Evaluate on the test dataset.
        evaluate(model, data_loader_test, device=device)
        # Per-epoch checkpoint (the epoch-numbered filename implies one
        # save per epoch).
        torch.save(
            {
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
            },
            os.path.join(hparams.model_dir, 'model_{}.pth'.format(epoch)))

    # FIX: flush and release the TensorBoard writer (it was never closed,
    # so the final events could be lost).
    writer.close()
    print("That's it!")