def train(output_model_dir: str,
          input_model_path: Optional[str] = None,
          tb_path: Optional[str] = None,
          nuscenes_version: str = 'v1.0-mini',
          data_path: str = 'data/v1.0-mini',
          n_scenes: Optional[int] = None,
          learning_rate: float = 1e-4,
          n_dumps_per_epoch: int = 10,
          n_loader_workers: int = 4,
          batch_size: int = 12,
          n_epochs: int = 50,
          device_id: Optional[List[int]] = None) -> None:
    """
    Train model, log training statistics if tb_path is specified.

    :param output_model_dir: path to directory to save model weights to
    :param input_model_path: path to model weights. If None, create new model
    :param tb_path: name of the folder for tensorboard data to be stored in
    :param nuscenes_version: version of the dataset
    :param data_path: relative path to data folder
    :param n_scenes: number of scenes in dataset
    :param learning_rate: learning rate for Adam
    :param n_dumps_per_epoch: how many times per epoch to dump images to tensorboard
        (not implemented yet)
    :param n_loader_workers: number of CPU workers for data loader processing
    :param batch_size: batch size
    :param n_epochs: total number of epochs to train the model
    :param device_id: list of GPU device ids to use, e.g. [0, 1]
    """
    # create path for model save
    os.makedirs(output_model_dir, exist_ok=True)

    # set up computing device for pytorch
    if torch.cuda.is_available():
        if device_id is None:
            device_id = [0]
        if max(device_id) < torch.cuda.device_count():
            # all specified device ids exist on this machine;
            # the first one is used as the root device
            device = torch.device(f'cuda:{device_id[0]}')
        else:
            # device_id is out of range, falling back to cuda:0
            print('Warning: specified GPU device id is out of range, using cuda:0.')
            device = torch.device('cuda:0')
        print('Using device: GPU\n')
    else:
        device = torch.device('cpu')
        print('Using device: CPU\n')

    date = datetime.datetime.now().strftime('%b-%d-%Y-%H:%M:%S')

    # set up tensorboard writer
    if tb_path is not None:
        train_writer = SummaryWriter(log_dir=f'{tb_path}/{date}/train')
        val_writer = SummaryWriter(log_dir=f'{tb_path}/{date}/val')
        print(f'Logging tensorboard data to directory: {tb_path}/{date}\n')
    else:
        train_writer, val_writer = None, None
        print('No tensorboard logging will be performed\n')

    # set up dataset and model
    nuscenes = create_nuscenes(data_path, nuscenes_version)
    train_dataset = NuscenesBEVDataset(nuscenes=nuscenes, n_scenes=n_scenes, mode='train')
    val_dataset = NuscenesBEVDataset(nuscenes=nuscenes, n_scenes=n_scenes, mode='val')
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              num_workers=n_loader_workers,
                              collate_fn=frames_bboxes_collate_fn, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True,
                            num_workers=n_loader_workers,
                            collate_fn=frames_bboxes_collate_fn, pin_memory=True)
    print('Loaders are ready.',
          f'Number of batches in train loader: {len(train_loader)}',
          f'Number of batches in validation loader: {len(val_loader)}', sep='\n')

    frame_depth, frame_width, frame_length = train_dataset.grid_size
    model = Detector(img_depth=frame_depth)
    if input_model_path is not None:
        model.load_state_dict(torch.load(input_model_path, map_location='cpu'))
    model = model.to(device)
    criterion = DetectionLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, gamma=0.5, step_size=50)  # TODO: adjust step_size empirically
    detector_out_shape = (batch_size, model.out_channels,
                          frame_width // (2 ** model.n_pools),
                          frame_length // (2 ** model.n_pools))
    gt_former = GroundTruthFormer((frame_width, frame_length), detector_out_shape, device=device)

    # guard against device_id being None on a CPU-only machine
    if device_id is not None and len(device_id) > 1 \
            and max(device_id) < torch.cuda.device_count():
        # if more than one device id is specified, use DataParallel
        model = nn.DataParallel(model, device_ids=device_id)
        model = model.to(device)

    best_val_score = float('-inf')
    for epoch in trange(n_epochs, desc='Epoch'):
        run_epoch(model, train_loader, criterion, gt_former, epoch, mode='train',
                  writer=train_writer, optimizer=optimizer, device=device)
        scheduler.step()
        val_loss, val_score = run_epoch(model, val_loader, criterion, gt_former, epoch,
                                        mode='val', train_loader_size=len(train_loader),
                                        writer=val_writer, device=device)
        # save model weights whenever the validation score improves
        if val_score > best_val_score:
            best_val_score = val_score
            torch.save(model.state_dict(), f'{output_model_dir}/{date}.pth')
            print('\nModel checkpoint is saved.',
                  f'loss: {val_loss:.3f}, score: {val_score:.3f}', sep='\n')
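
# A minimal usage sketch for train(). All argument values below are illustrative
# placeholders, not part of the original script; only the parameter names come
# from the signature above.
if __name__ == '__main__':
    train(output_model_dir='checkpoints',   # placeholder output directory
          tb_path='runs',                   # placeholder tensorboard directory
          nuscenes_version='v1.0-mini',
          data_path='data/v1.0-mini',
          batch_size=12,
          n_epochs=50,
          device_id=[0])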
def main(run_id, pretrained, data_files, model_params, training_params, device):
    best_acc1 = 0

    batch_size = training_params['batch_size']
    test_batch_size = training_params['test_batch_size']
    epochs = training_params['epochs']
    start_epoch = training_params['start_epoch']
    n_warmup_steps = training_params['n_warmup_steps']
    log_interval = training_params['log_interval']

    # model is trained for binary classification (needed by the dataloader)
    binary_class = model_params['NUM_SPOOF_CLASS'] == 2

    kwargs = {'num_workers': 2, 'pin_memory': True} if device == torch.device('cuda') else {}

    # create model
    model = Detector(**model_params).to(device)
    num_model_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('===> Model total parameters: {}'.format(num_model_params))

    # wrap model for multi-GPU training, if necessary
    if device == torch.device('cuda') and torch.cuda.device_count() > 1:
        print('multi-gpu')
        model = nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    optim = optimizer.ScheduledOptim(
        torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                         betas=(0.9, 0.98), eps=1e-09, weight_decay=1e-4,
                         lr=3e-4, amsgrad=True),
        n_warmup_steps)

    # optionally resume from a checkpoint
    if pretrained:
        if os.path.isfile(pretrained):
            print("===> loading checkpoint '{}'".format(pretrained))
            checkpoint = torch.load(pretrained)
            start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            model.load_state_dict(checkpoint['state_dict'])
            optim.load_state_dict(checkpoint['optimizer'])
            print("===> loaded checkpoint '{}' (epoch {})".format(
                pretrained, checkpoint['epoch']))
        else:
            print("===> no checkpoint found at '{}'".format(pretrained))

    # data loading code
    train_data = SpoofDatsetSystemID(data_files['train_scp'],
                                     data_files['train_utt2index'], binary_class)
    val_data = SpoofDatsetSystemID(data_files['dev_scp'],
                                   data_files['dev_utt2index'], binary_class)
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size,
                                               shuffle=True, **kwargs)
    val_loader = torch.utils.data.DataLoader(val_data, batch_size=test_batch_size,
                                             shuffle=True, **kwargs)

    best_epoch = 0
    early_stopping, max_patience = 0, 100  # for early stopping
    os.makedirs("model_snapshots/" + run_id, exist_ok=True)

    for epoch in range(start_epoch, start_epoch + epochs):
        trainer.train(train_loader, model, optim, epoch, device, log_interval)
        acc1 = validate.validate(val_loader, data_files['dev_utt2systemID'], model,
                                 device, log_interval)
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        # adjust learning rate + early stopping
        if is_best:
            early_stopping = 0
            best_epoch = epoch + 1
        else:
            early_stopping += 1
            if epoch - best_epoch > 2:
                optim.increase_delta()
                best_epoch = epoch + 1
        if early_stopping == max_patience:
            break

        # save model
        optimizer.save_checkpoint(
            {
                'epoch': epoch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optim.state_dict(),
            },
            is_best,
            "model_snapshots/" + str(run_id),
            str(epoch) + ('_%.3f' % acc1) + ".pth.tar")
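
# A minimal invocation sketch for main(). Only the dictionary keys are taken
# from the code above; every path and numeric value is a placeholder, and the
# real Detector likely requires additional entries in model_params.
if __name__ == '__main__':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    data_files = {
        'train_scp': 'data/train/feats.scp',          # placeholder path
        'train_utt2index': 'data/train/utt2index',    # placeholder path
        'dev_scp': 'data/dev/feats.scp',              # placeholder path
        'dev_utt2index': 'data/dev/utt2index',        # placeholder path
        'dev_utt2systemID': 'data/dev/utt2systemID',  # placeholder path
    }
    model_params = {'NUM_SPOOF_CLASS': 2}  # 2 -> binary classification
    training_params = {'batch_size': 32, 'test_batch_size': 32, 'epochs': 100,
                       'start_epoch': 0, 'n_warmup_steps': 1000, 'log_interval': 50}
    main(run_id='exp1', pretrained=None, data_files=data_files,
         model_params=model_params, training_params=training_params, device=device)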
val_cache = 'cache/cache_val_{}_{}.pth'.format(
    'cos' if args.distance == 'cosine' else 'euc',
    'med' if 'medoids' in args.centroids else 'centr')
val_paths, val_data = precompute_embeddings(features_state, val_data, model, args,
                                            return_paths=True, cache=val_cache)

train_loader = DataLoader(train_data, sampler=sampler, pin_memory=True,
                          batch_size=args.batch_size)
val_loader = DataLoader(val_data, shuffle=False, pin_memory=True,
                        batch_size=args.batch_size)

# Train loop
best = torch.zeros(3)
progress = trange(1, args.epochs + 1)
for epoch in progress:
    progress.set_description('TRAIN')
    train(train_loader, detector, optimizer, args)

    progress.set_description('EVAL')
    val_metrics_names, val_metrics = evaluate(val_loader, val_paths, detector, args)
    val_metrics_dict = dict(zip(val_metrics_names, val_metrics.tolist()))

    # DataFrame.append() was removed in pandas 2.0; use pd.concat instead
    log = pd.concat([log, pd.DataFrame(val_metrics_dict, index=[pd.Timestamp('now')])])
    log.to_csv(log_file)

    if best[2] < val_metrics[2]:  # keep best macro-AUC
        ckpt_path = os.path.join(ckpt_dir, 'best_model.pth')
        torch.save({
            'detector': detector.state_dict(),
            'optimizer': optimizer.state_dict(),
            'metrics': val_metrics_dict
        }, ckpt_path)
    best = torch.max(val_metrics, best)
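
# Restoring the best checkpoint saved above: a minimal sketch assuming the same
# `detector`, `optimizer`, and `ckpt_dir` objects as in the loop. The dictionary
# keys ('detector', 'optimizer', 'metrics') match those passed to torch.save().
ckpt = torch.load(os.path.join(ckpt_dir, 'best_model.pth'), map_location='cpu')
detector.load_state_dict(ckpt['detector'])
optimizer.load_state_dict(ckpt['optimizer'])
print('Restored checkpoint with metrics:', ckpt['metrics'])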