def do_training(model, torch_dataset, torch_dataset_test, num_epochs, writer):
    data_loader = DataLoader(
        torch_dataset, batch_size=8, shuffle=True, collate_fn=utils.collate_fn
    )
    data_loader_test = DataLoader(
        torch_dataset_test, batch_size=2, shuffle=False, collate_fn=utils.collate_fn
    )

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    print("Using device {}".format(device))
    model.to(device)

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

    for epoch in range(num_epochs):
        log = train_one_epoch(
            model, optimizer, data_loader, device, epoch, print_freq=10
        )
        writer.add_scalar("Train/Learning rate", log.meters["lr"].value, epoch)
        writer.add_scalar("Train/Loss", log.meters["loss"].value, epoch)
        lr_scheduler.step()
        evaluate(model, data_loader_test, device)
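# A minimal usage sketch for do_training. `model`, `train_ds`, and `test_ds`
# are assumptions (a torchvision detection model plus datasets compatible with
# utils.collate_fn); the TensorBoard log directory is likewise illustrative.
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir="runs/detection")  # hypothetical log dir
do_training(model, train_ds, test_ds, num_epochs=6, writer=writer)
writer.close()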
def main():
    """
    data_dir: assumed hierarchy (data_dir/Normal, data_dir/Segmentation),
        with every image containing a package.
    model_out_path: path/name of the saved model file.
    """
    data_dir = sys.argv[1]
    model_out_path = sys.argv[2]
    num_epochs = int(sys.argv[3]) if len(sys.argv) > 3 else 6
    plot_train_loss = bool(distutils.util.strtobool(sys.argv[4])) if len(sys.argv) > 4 else False
    plot_valid_loss = bool(distutils.util.strtobool(sys.argv[5])) if len(sys.argv) > 5 else False

    # use our dataset with the defined transformations
    dataset = PackageDataset(data_dir, get_transform(train=True))
    dataset_test = PackageDataset(data_dir, get_transform(train=False))

    # split the dataset into train and test sets
    torch.manual_seed(1)
    indices = torch.randperm(len(dataset)).tolist()
    dataset = torch.utils.data.Subset(dataset, indices[:-20])
    dataset_test = torch.utils.data.Subset(dataset_test, indices[-20:])

    # define training and validation data loaders
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=2, shuffle=True, num_workers=4,
        collate_fn=references.utils.collate_fn)
    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=1, shuffle=False, num_workers=4,
        collate_fn=references.utils.collate_fn)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # only background and package for this dataset
    num_classes = 2

    # get the model and move it to the right device
    my_model = get_model_instance_segmentation(num_classes)
    my_model.to(device)

    # construct an optimizer
    params = [p for p in my_model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
    # lr scheduler that decreases the learning rate by 10x every 3 epochs
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

    # TRAINING
    all_train_loss = []
    all_valid_loss = []
    for epoch in range(num_epochs):
        print("TRAIN")
        # train for one epoch, printing every 60 iterations
        _, all_loss_epoch = train_one_epoch(my_model, optimizer, data_loader,
                                            device, epoch, print_freq=60)
        if plot_train_loss:
            num_recorded = len(all_loss_epoch)
            avg_loss_epoch = sum(all_loss_epoch) / num_recorded
            all_train_loss.extend(all_loss_epoch)
            print(f"AVERAGE LOSS EPOCH {epoch}: {avg_loss_epoch}")

        # validation loss
        if plot_valid_loss:
            with torch.no_grad():
                for images, targets in data_loader_test:
                    images = list(image.to(device) for image in images)
                    targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
                    loss_dict = my_model(images, targets)
                    loss_dict_reduced = references.utils.reduce_dict(loss_dict)
                    losses_reduced = sum(loss for loss in loss_dict_reduced.values())
                    all_valid_loss.append(losses_reduced.item())

        # update the learning rate
        lr_scheduler.step()

        print("EVALUATE")
        # evaluate on the test dataset
        evaluate(my_model, data_loader_test, device=device)

    if plot_train_loss:
        train_x = [a * num_epochs / len(all_train_loss) for a in range(len(all_train_loss))]
        plt.plot(train_x, all_train_loss, label="training loss")
    if plot_valid_loss:
        # scale the x-axis to epochs so both curves share the same axis
        valid_x = [a * num_epochs / len(all_valid_loss) for a in range(len(all_valid_loss))]
        plt.plot(valid_x, all_valid_loss, label="validation loss")
    if plot_train_loss or plot_valid_loss:
        plt.xlabel("Epochs")
        plt.ylabel("Loss")
        plt.legend()
        plt.title("Loss vs. Epochs")
        plt.xticks([a for a in range(num_epochs + 1)])
        plt.savefig(f"{model_out_path}-loss.png")

    torch.save(my_model, model_out_path)
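# Example invocation (the script name is illustrative). Positional argv:
#   1: data_dir  2: model_out_path  3: num_epochs (optional, default 6)
#   4: plot_train_loss (optional bool)  5: plot_valid_loss (optional bool)
#
#   python train_packages.py ./data ./package_model.pth 8 True True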
# construct an optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
# and a learning rate scheduler
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

# let's train it for 10 epochs
num_epochs = 10

for epoch in range(num_epochs):
    # train for one epoch, printing every 10 iterations
    train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
    # update the learning rate
    lr_scheduler.step()
    # evaluate on the test dataset
    evaluate(model, data_loader_test, device=device)

# save the model's state dict for inference
torch.save(model.state_dict(), "dict.pth")

print("That's it!")
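# Sketch of reloading the saved weights for inference. `build_model()` is a
# hypothetical helper standing in for however the architecture above was
# constructed; a state dict restores weights only, not the module itself.
model_for_inference = build_model()  # hypothetical: must rebuild the same architecture
model_for_inference.load_state_dict(torch.load("dict.pth", map_location="cpu"))
model_for_inference.eval()  # switch to inference mode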
def main(args):
    experiment_name = ''
    output_base_url = osp.join(
        project_root, 'weights',
        '{}_{}'.format(experiment_name, datetime.now().strftime("%d-%m-%Y-%H-%M")))

    # get model
    print('loading model...')
    model = get_resnet50_pretrained_model()

    # create datasets
    print('loading train dataset...')
    train_dataset = PoseDataset([
        osp.join(project_root, 'data/vzf/freestyle/freestyle_1'),
        osp.join(project_root, 'data/vzf/freestyle/freestyle_2'),
        osp.join(project_root, 'data/vzf/freestyle/freestyle_3'),
        osp.join(project_root, 'data/vzf/freestyle/freestyle_4')
    ], train=True)
    print('train dataset size: {}'.format(len(train_dataset)))

    print('loading val dataset...')
    val_dataset = PoseDataset([
        osp.join(project_root, 'data/vzf/freestyle/freestyle_5'),
        osp.join(project_root, 'data/vzf/freestyle/freestyle_6')
    ], train=False)
    print('val dataset size: {}'.format(len(val_dataset)))

    # create dataloaders
    print('creating dataloaders...')
    data_loader = DataLoader(train_dataset, batch_size=10, shuffle=True,
                             num_workers=4, collate_fn=collate_fn)
    # no shuffling needed for the validation loader
    data_loader_test = DataLoader(val_dataset, batch_size=10, shuffle=False,
                                  num_workers=4, collate_fn=collate_fn)

    # get device
    device = select_best_gpu(min_mem=6100) if torch.cuda.is_available() else torch.device('cpu')
    print('selected device: {}'.format(device))

    # only set roi_heads trainable
    train_only_roi_heads(model)
    # grab trainable parameters
    params = [p for p in model.parameters() if p.requires_grad]

    # create optimizer and scheduler
    optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

    # train
    print('loading model onto device')
    model.to(device)
    # training mode
    model.train()

    start = time.time()

    # initialize to a very large value
    min_box_loss = 10000
    min_kp_loss = 10000
    num_epochs = 100
    # snapshot the initial weights so a rollback is always possible
    temp_state_dict = copy.deepcopy(model.state_dict())

    get_validation_error(model, data_loader_test, device)
    for epoch in tqdm(range(num_epochs)):
        train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)

        if epoch % 5 == 0 or epoch == num_epochs - 1:
            lr_scheduler.step()
            # validation
            box_loss, kp_loss = get_validation_error(model, data_loader_test, device)
            print('box_loss: {}, kp_loss: {}'.format(box_loss, kp_loss))
            if kp_loss < min_kp_loss:
                # lower validation loss found
                print('improved val score, saving state dict...')
                min_kp_loss = kp_loss
                min_box_loss = box_loss
                temp_state_dict = copy.deepcopy(model.state_dict())
            else:
                print('loading previous state dict (current best: {})...'.format(min_kp_loss))
                model.load_state_dict(temp_state_dict)

        # every 10 epochs, run the COCO evaluation and checkpoint
        if epoch % 10 == 0 or epoch == num_epochs - 1:
            print('COCO EVAL EPOCH {}'.format(epoch))
            evaluate(model, data_loader_test, device=device)
            torch.save(
                model.state_dict(),
                output_base_url + '_epoch{}-{}_min_val_loss_{}.wth'.format(
                    epoch, num_epochs, min_kp_loss))

    end = time.time()
    duration_min = int((end - start) / 60)

    # post result to slack channel
    slack_message(
        "Done Training, took {}min \n box loss: {}, KP loss: {}".format(
            duration_min, min_box_loss, min_kp_loss),
        channel='#training')
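# get_validation_error is defined elsewhere; below is one plausible sketch of
# its shape, assuming a torchvision Keypoint R-CNN whose train-mode forward
# returns a loss dict containing 'loss_box_reg' and 'loss_keypoint'.
import torch

@torch.no_grad()
def get_validation_error(model, data_loader, device):
    model.train()  # detection models only return the loss dict in train mode
    box_loss, kp_loss, num_batches = 0.0, 0.0, 0
    for images, targets in data_loader:
        images = [img.to(device) for img in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        loss_dict = model(images, targets)
        box_loss += loss_dict['loss_box_reg'].item()
        kp_loss += loss_dict['loss_keypoint'].item()
        num_batches += 1
    return box_loss / max(num_batches, 1), kp_loss / max(num_batches, 1)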
# move model to the right device
model.to(device)

# construct an optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.00001, momentum=0.9, weight_decay=0.0005)
# and a learning rate scheduler which decreases the learning rate by 10x every 10 epochs
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

for epoch in range(1, NUM_EPOCHS + 1):
    train_one_epoch(model, optimizer, train_loader, device, epoch, print_freq=200)
    # update the learning rate
    lr_scheduler.step()
    # evaluate on the test dataset and checkpoint every 2 epochs
    if epoch % 2 == 0:
        # evaluate_and_write_result_files(model, data_loader_test)
        torch.save(model.state_dict(),
                   os.path.join("model", f"model_epoch_{epoch}.model"))
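# Sketch: resuming from the most recent per-epoch checkpoint written above.
# Assumes `model` has the same architecture; sorting by mtime picks the newest
# file regardless of how the epoch numbers sort as strings.
import glob

checkpoints = sorted(glob.glob(os.path.join("model", "model_epoch_*.model")),
                     key=os.path.getmtime)
if checkpoints:
    model.load_state_dict(torch.load(checkpoints[-1], map_location=device))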
def main(args):
    utils.init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)

    # Data loading code
    print("Loading data")
    dataset = BirdDataset(name=args.dataset, transforms=get_transform(True),
                          train=True, small_set=args.small_set,
                          only_instance=args.only_instance)
    dataset_test = BirdDataset(name=args.dataset, transforms=get_transform(False),
                               train=False, small_set=args.small_set,
                               only_instance=args.only_instance)

    print("Creating data loaders")
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test)
    else:
        train_sampler = torch.utils.data.RandomSampler(dataset)
        test_sampler = torch.utils.data.SequentialSampler(dataset_test)

    if args.aspect_ratio_group_factor >= 0:
        group_ids = create_aspect_ratio_groups(dataset, k=args.aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size)
    else:
        train_batch_sampler = torch.utils.data.BatchSampler(
            train_sampler, args.batch_size, drop_last=True)

    data_loader = torch.utils.data.DataLoader(
        dataset, batch_sampler=train_batch_sampler, num_workers=args.workers,
        collate_fn=utils.collate_fn)
    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=1, sampler=test_sampler, num_workers=args.workers,
        collate_fn=utils.collate_fn)

    print("Creating model")
    if args.model == 'normal':
        print('normal model')
        model = get_model_attention(num_classes=args.num_classes,
                                    use_focal_loss=args.use_focal_loss,
                                    focal_gamma=args.focal_gamma,
                                    use_attention=False)
    elif args.model == 'attention':
        print('attention model')
        model = get_model_attention(num_classes=args.num_classes,
                                    attention_head_output_channels=args.num_parts,
                                    use_focal_loss=args.use_focal_loss,
                                    focal_gamma=args.focal_gamma,
                                    use_attention=True)
    elif args.model == 'attention_transformer':
        print('attention transformer model')
        model = get_model_attention(transformer=True,
                                    num_classes=args.num_classes,
                                    attention_head_output_channels=args.num_parts,
                                    use_focal_loss=args.use_focal_loss,
                                    focal_gamma=args.focal_gamma,
                                    use_attention=True)
    else:
        raise ValueError("'model' must be 'normal', 'attention' or 'attention_transformer'")
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(
        params, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)

    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)

    if args.resume:
        print("Loading checkpoint to resume")
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
        if not args.ft:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    if args.test_only:
        evaluator = evaluate(model, data_loader_test, device=device, epoch=0,
                             name=Path(args.output_dir).name, do_record=True)
        return evaluator

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        train_one_epoch(model, optimizer, data_loader, device, epoch, args.print_freq,
                        name=Path(args.output_dir).name, use_aug=args.use_aug)
        lr_scheduler.step()
        if args.output_dir:
            utils.save_on_master({
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'args': args,
                'epoch': epoch},
                os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))

        # evaluate after every epoch; only record results on the final one
        if not args.no_eval:
            do_record = epoch == args.epochs - 1
            evaluate(model, data_loader_test, epoch=epoch, device=device,
                     name=Path(args.output_dir).name, do_record=do_record)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
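# Example distributed launch. This assumes the script follows the torchvision
# reference-script convention, where utils.init_distributed_mode reads the
# environment variables set by torchrun; the script name and flag spellings
# are illustrative, not confirmed by the code above.
#
#   torchrun --nproc_per_node=4 train_birds.py --model attention --epochs 26 \
#       --batch-size 2 --output-dir ./runs/birds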
def do(config):
    # use our dataset and defined transformations
    if config.data_type == 'PennFudanPed':
        dataset = PennFudanDataset(config.root, get_transform(train=True))
        dataset_test = PennFudanDataset(config.root, get_transform(train=False))
    else:
        dataset = UserDataset(config.root, get_transform(train=True))
        dataset_test = UserDataset(config.root, get_transform(train=False))

    # split the dataset into train and test sets
    torch.manual_seed(1)
    indices = torch.randperm(len(dataset)).tolist()
    dataset = torch.utils.data.Subset(dataset, indices[:-50])
    dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])

    # define training and validation data loaders
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=config.batch_size, shuffle=True,
        collate_fn=utils.collate_fn)
    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=config.batch_size, shuffle=False,
        collate_fn=utils.collate_fn)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # our dataset has two classes only - background and person
    # get the model using our helper function
    model = get_instance_segmentation_model(config.num_classes)
    # move model to the right device
    model.to(device)

    # construct an optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=config.lr, momentum=config.momentum,
                                weight_decay=config.weight_decay)
    # and a learning rate scheduler which decreases the learning rate
    # by config.gamma every config.step_size epochs
    lr_scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer, step_size=config.step_size, gamma=config.gamma)

    for epoch in range(config.num_epochs):
        # train for one epoch, printing every 10 iterations
        train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
        # update the learning rate
        lr_scheduler.step()
        # evaluate on the test dataset
        evaluate(model, data_loader_test, device=device)

    torch.save(model, os.path.join(config.save_directory, config.ckpt_name))
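# Note: torch.save(model, ...) above pickles the entire module, so reloading
# requires the defining classes to be importable under the same names. A load
# sketch, reusing the config fields from above:
loaded_model = torch.load(os.path.join(config.save_directory, config.ckpt_name),
                          map_location='cpu')
loaded_model.eval()  # inference mode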