Example #1
def train(output_model_dir: str, input_model_path: Optional[str] = None, tb_path: Optional[str] = None,
          nuscenes_version: str = 'v1.0-mini', data_path: str = "data/v1.0-mini", n_scenes: Optional[int] = None,
          learning_rate: float = 1e-4, n_dumps_per_epoch: int = 10, n_loader_workers: int = 4, batch_size: int = 12,
          n_epochs: int = 50, device_id: Optional[List[int]] = None) -> None:
    """
    Train model, log training statistics if tb_path is specified.
    :param output_model_dir: path to directory to save model weights to
    :param input_model_path: path to model weights. If None, create new model
    :param tb_path: name of the folder for tensorboard data to be stored in
    :param nuscenes_version: version of the dataset
    :param data_path: relative path to data folder
    :param n_scenes: number of scenes in dataset
    :param learning_rate: learning rate for Adam
    :param n_dumps_per_epoch: how many times per epoch to dump images to tensorboard (not implemented yet)
    :param n_loader_workers: number of CPU workers for data loader processing
    :param batch_size: batch size
    :param n_epochs: total number of epochs to train the model
    :param device_id: list of GPU device ids to use, e.g. [0, 1]
    """
    # create path for model save
    os.makedirs(output_model_dir, exist_ok=True)

    # set up computing device for pytorch
    if torch.cuda.is_available():
        if device_id is None:
            device_id = [0]
        if max(device_id) < torch.cuda.device_count():
            # all requested device ids exist on this machine;
            # the first one is used as the root device
            device = torch.device(f'cuda:{device_id[0]}')
        else:
            # requested device_id is out of range, falling back to cuda:0
            print('Warning: specified device_id is out of range for the available GPUs, using cuda:0.')
            device = torch.device('cuda:0')
        print('Using device: GPU\n')
    else:
        device = torch.device('cpu')
        print('Using device: CPU\n')

    date = datetime.datetime.now().strftime('%b-%d-%Y-%H:%M:%S')

    # set up tensorboard writer
    if tb_path is not None:
        train_writer = SummaryWriter(log_dir=f'{tb_path}/{date}/train')
        val_writer = SummaryWriter(log_dir=f'{tb_path}/{date}/val')
        print(f'Logging tensorboard data to directory: {tb_path}/{date}\n')
    else:
        train_writer, val_writer = None, None
        print('No tensorboard logging will be performed\n')

    # set up dataset and model
    nuscenes = create_nuscenes(data_path, nuscenes_version)
    train_dataset = NuscenesBEVDataset(nuscenes=nuscenes, n_scenes=n_scenes, mode='train')
    val_dataset = NuscenesBEVDataset(nuscenes=nuscenes, n_scenes=n_scenes, mode='val')
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=n_loader_workers,
                              collate_fn=frames_bboxes_collate_fn, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=n_loader_workers,
                            collate_fn=frames_bboxes_collate_fn, pin_memory=True)
    print('Loaders are ready.',
          f'Number of batches in train loader: {len(train_loader)}',
          f'Number of batches in validation loader: {len(val_loader)}', sep='\n')

    frame_depth, frame_width, frame_length = train_dataset.grid_size
    model = Detector(img_depth=frame_depth)
    if input_model_path is not None:
        model.load_state_dict(torch.load(input_model_path, map_location="cpu"))
    model = model.to(device)
    criterion = DetectionLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, gamma=0.5, step_size=50)  # TODO: adjust step_size empirically
    # detector output is downsampled by a factor of 2 at every pooling layer
    detector_out_shape = (batch_size, model.out_channels, frame_width // (2 ** model.n_pools),
                          frame_length // (2 ** model.n_pools))
    gt_former = GroundTruthFormer((frame_width, frame_length), detector_out_shape, device=device)

    if torch.cuda.is_available() and len(device_id) > 1 and max(device_id) < torch.cuda.device_count():
        # more than one valid device_id specified: replicate the model across GPUs
        model = nn.DataParallel(model, device_ids=device_id)

    best_val_score = float('-inf')
    for epoch in trange(n_epochs, desc="Epoch"):
        run_epoch(model, train_loader, criterion, gt_former, epoch, mode='train',
                  writer=train_writer, optimizer=optimizer, device=device)
        scheduler.step()
        val_loss, val_score = run_epoch(model, val_loader, criterion, gt_former, epoch,
                                        mode='val', train_loader_size=len(train_loader), writer=val_writer,
                                        device=device)
        # save model weights whenever the validation score improves
        if val_score > best_val_score:
            best_val_score = val_score
            torch.save(model.state_dict(), f'{output_model_dir}/{date}.pth')
            print('\nModel checkpoint is saved.', f'loss: {val_loss:.3f}, score: {val_score:.3f}', sep='\n')
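For context, here is a minimal sketch of how this train function might be invoked; every path and hyperparameter below is an illustrative placeholder, not taken from the original project.

if __name__ == '__main__':
    # Hypothetical entry point; all paths and hyperparameters are placeholders.
    train(
        output_model_dir='checkpoints',
        input_model_path=None,          # start from randomly initialised weights
        tb_path='runs',                 # enable tensorboard logging
        nuscenes_version='v1.0-mini',
        data_path='data/v1.0-mini',
        batch_size=8,
        n_epochs=20,
        device_id=[0],                  # single-GPU training
    )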
Example #2
def main(run_id, pretrained, data_files, model_params, training_params,
         device):
    best_acc1 = 0
    batch_size = training_params['batch_size']
    test_batch_size = training_params['test_batch_size']
    epochs = training_params['epochs']
    start_epoch = training_params['start_epoch']
    n_warmup_steps = training_params['n_warmup_steps']
    log_interval = training_params['log_interval']

    # the model is trained for binary classification (this flag configures the dataloader)
    binary_class = model_params['NUM_SPOOF_CLASS'] == 2

    kwargs = {
        'num_workers': 2,
        'pin_memory': True
    } if device == torch.device('cuda') else {}

    # create model
    model = Detector(**model_params).to(device)
    num_model_params = sum(p.numel() for p in model.parameters()
                           if p.requires_grad)
    print('===> Model total parameter: {}'.format(num_model_params))

    # Wrap model for multi-GPUs, if necessary
    if device == torch.device('cuda') and torch.cuda.device_count() > 1:
        print('multi-gpu')
        model = nn.DataParallel(model).cuda()

    # define the optimizer with a warm-up schedule
    optim = optimizer.ScheduledOptim(
        torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                         betas=(0.9, 0.98),
                         eps=1e-09,
                         weight_decay=1e-4,
                         lr=3e-4,
                         amsgrad=True), n_warmup_steps)

    # optionally resume from a checkpoint
    if pretrained:
        if os.path.isfile(pretrained):
            print("===> loading checkpoint '{}'".format(pretrained))
            checkpoint = torch.load(pretrained)
            start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            model.load_state_dict(checkpoint['state_dict'])
            optim.load_state_dict(checkpoint['optimizer'])
            print("===> loaded checkpoint '{}' (epoch {})".format(
                pretrained, checkpoint['epoch']))
        else:
            print("===> no checkpoint found at '{}'".format(pretrained))

    # Data loading code
    train_data = SpoofDatsetSystemID(data_files['train_scp'],
                                     data_files['train_utt2index'],
                                     binary_class)
    val_data = SpoofDatsetSystemID(data_files['dev_scp'],
                                   data_files['dev_utt2index'], binary_class)

    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               **kwargs)
    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=test_batch_size,
                                             shuffle=True,
                                             **kwargs)

    best_epoch = 0
    early_stopping, max_patience = 0, 100  # for early stopping
    os.makedirs("model_snapshots/" + run_id, exist_ok=True)
    for epoch in range(start_epoch, start_epoch + epochs):

        trainer.train(train_loader, model, optim, epoch, device, log_interval)
        acc1 = validate.validate(val_loader, data_files['dev_utt2systemID'],
                                 model, device, log_interval)

        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        # adjust learning rate + early stopping
        if is_best:
            early_stopping = 0
            best_epoch = epoch + 1
        else:
            early_stopping += 1
            if epoch - best_epoch > 2:
                optim.increase_delta()
                best_epoch = epoch + 1
        if early_stopping == max_patience:
            break

        # save model; store epoch + 1 so a resumed run continues with the next epoch
        optimizer.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optim.state_dict(),
            }, is_best, "model_snapshots/" + str(run_id),
            str(epoch) + ('_%.3f' % acc1) + ".pth.tar")
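A hedged sketch of how main might be wired up follows; the dict keys mirror the lookups inside the function itself, while the concrete values and file paths are assumptions. In the real project, model_params would carry the full Detector configuration; only the key main() itself reads is shown here.

# Hypothetical invocation; keys mirror the lookups inside main(), values are illustrative.
training_params = {
    'batch_size': 32,
    'test_batch_size': 32,
    'epochs': 30,
    'start_epoch': 0,
    'n_warmup_steps': 1000,
    'log_interval': 100,
}
model_params = {'NUM_SPOOF_CLASS': 2}  # binary spoof / bona fide classification
data_files = {
    'train_scp': 'data/train/feats.scp',
    'train_utt2index': 'data/train/utt2index',
    'dev_scp': 'data/dev/feats.scp',
    'dev_utt2index': 'data/dev/utt2index',
    'dev_utt2systemID': 'data/dev/utt2systemID',
}
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
main(run_id='run_001', pretrained=None, data_files=data_files,
     model_params=model_params, training_params=training_params, device=device)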
Example #3
    val_cache = 'cache/cache_val_{}_{}.pth'.format('cos' if args.distance == 'cosine' else 'euc',
                                                   'med' if 'medoids' in args.centroids else 'centr')
    val_paths, val_data = precompute_embeddings(features_state, val_data, model, args, return_paths=True, cache=val_cache)

    train_loader = DataLoader(train_data, sampler=sampler, pin_memory=True, batch_size=args.batch_size)
    val_loader = DataLoader(val_data, shuffle=False, pin_memory=True, batch_size=args.batch_size)

    # Train loop
    best = torch.zeros(3)
    progress = trange(1, args.epochs + 1)
    for epoch in progress:
        progress.set_description('TRAIN')
        train(train_loader, detector, optimizer, args)
        progress.set_description('EVAL')
        val_metrics_names, val_metrics = evaluate(val_loader, val_paths, detector, args)

        val_metrics_dict = dict(zip(val_metrics_names, val_metrics.tolist()))
        log = pd.concat([log, pd.DataFrame(val_metrics_dict, index=[pd.Timestamp('now')])])
        log.to_csv(log_file)

        if best[2] < val_metrics[2]:  # keep best macro-AUC
            ckpt_path = os.path.join(ckpt_dir, 'best_model.pth')
            torch.save({
                'detector': detector.state_dict(),
                'optimizer': optimizer.state_dict(),
                'metrics': val_metrics_dict
            }, ckpt_path)

        best = torch.max(val_metrics, best)
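The val_cache logic above follows a compute-once, reload-later pattern: embeddings are persisted to disk on the first run and reloaded on subsequent runs. As a standalone illustration of that idea (this is not the project's precompute_embeddings, just a minimal sketch of the caching pattern):

import os
import torch

def cached_embeddings(cache_path, compute_fn):
    """Return embeddings from cache_path if present, else compute and persist them."""
    if os.path.isfile(cache_path):
        # cache hit: reuse the previously computed tensor
        return torch.load(cache_path, map_location='cpu')
    embeddings = compute_fn()
    os.makedirs(os.path.dirname(cache_path) or '.', exist_ok=True)
    torch.save(embeddings, cache_path)
    return embeddings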