Пример #1
0
def set_hyperparameters(params,
                        num_classes,
                        model,
                        checkpoint,
                        dontcare_val,
                        loss_fn,
                        optimizer,
                        class_weights=None,
                        inference: str = ''):
    """
    Build the loss criterion, optimizer and learning-rate scheduler for a model.

    Learning rate, weight decay, step size and gamma are read from the
    ``training`` section of the yaml config; when a key is absent, ``None`` is
    requested as the default from ``get_key_def``.
    :param params: (dict) Parameters found in the yaml config file
    :param num_classes: (int) number of classes for current task (not used by
                        this function; kept so existing callers keep working)
    :param model: initialized model
    :param checkpoint: (dict) state dict as loaded by model_choice.py; when
                       truthy, model and optimizer are restored from it
    :param dontcare_val: value in label to ignore during loss calculation
    :param loss_fn: loss type handed to MultiClassCriterion
    :param optimizer: optimizer mode handed to create_optimizer
    :param class_weights: optional sequence of class weights for the loss function
    :param inference: (str) path to inference checkpoint (used in load_from_checkpoint())
    :return: model, criterion, optimizer, lr_scheduler
    """
    training_params = params['training']
    lr = get_key_def('learning_rate', training_params, None)
    weight_decay = get_key_def('weight_decay', training_params, None)
    step_size = get_key_def('step_size', training_params, None)
    gamma = get_key_def('gamma', training_params, None)

    # A bare ``if class_weights`` raises for numpy arrays / tensors holding
    # more than one element, so emptiness is tested through None and len().
    try:
        has_weights = class_weights is not None and len(class_weights) > 0
    except TypeError:
        # Unsized value (e.g. a scalar or bool): keep the plain truthiness rule.
        has_weights = bool(class_weights)
    class_weights = torch.tensor(class_weights) if has_weights else None

    # Loss function
    criterion = MultiClassCriterion(loss_type=loss_fn,
                                    ignore_index=dontcare_val,
                                    weight=class_weights)

    # Optimizer: the ``optimizer`` argument names the mode; the local of the
    # same name is rebound to the optimizer object that gets returned.
    opt_fn = optimizer
    optimizer = create_optimizer(params=model.parameters(),
                                 mode=opt_fn,
                                 base_lr=lr,
                                 weight_decay=weight_decay)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer,
                                             step_size=step_size,
                                             gamma=gamma)

    if checkpoint:
        tqdm.write('Loading checkpoint...')
        model, optimizer = load_from_checkpoint(checkpoint,
                                                model,
                                                optimizer=optimizer,
                                                inference=inference)

    return model, criterion, optimizer, lr_scheduler
Пример #2
0
def set_hyperparameters(params, num_classes, model, checkpoint):
    """
    Build the loss criterion, optimizer and learning-rate scheduler for a model.

    Hyperparameters come from the ``training`` section of the yaml config.
    Learning rate, weight decay, step size and gamma are requested from
    ``get_key_def`` together with a "missing mandatory ..." message.
    ``class_weights`` and ``ignore_index`` are optional.
    :param params: (dict) Parameters found in the yaml config file
    :param num_classes: (int) number of classes for current task
    :param model: Model loaded from model_choice.py
    :param checkpoint: (dict) state dict as loaded by model_choice.py; when
                       truthy, model and optimizer are restored from it
    :return: model, criterion, optimizer, lr_scheduler
    """
    training_params = params['training']

    # set mandatory hyperparameters values with those in config file if they exist
    lr = get_key_def('learning_rate', training_params, None,
                     "missing mandatory learning rate parameter")
    weight_decay = get_key_def('weight_decay', training_params, None,
                               "missing mandatory weight decay parameter")
    step_size = get_key_def('step_size', training_params, None,
                            "missing mandatory step size parameter")
    gamma = get_key_def('gamma', training_params, None,
                        "missing mandatory gamma parameter")

    # optional hyperparameters. Read the same way as ignore_index so that a
    # config without a 'class_weights' entry does not fail with KeyError.
    raw_class_weights = get_key_def('class_weights', training_params, None)
    class_weights = None
    if raw_class_weights:
        class_weights = torch.tensor(raw_class_weights)
        verify_weights(num_classes, class_weights)
    ignore_index = get_key_def('ignore_index', training_params, -1)

    # Loss function
    criterion = MultiClassCriterion(loss_type=training_params['loss_fn'],
                                    ignore_index=ignore_index,
                                    weight=class_weights)

    # Optimizer
    opt_fn = training_params['optimizer']
    optimizer = create_optimizer(params=model.parameters(),
                                 mode=opt_fn,
                                 base_lr=lr,
                                 weight_decay=weight_decay)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer,
                                             step_size=step_size,
                                             gamma=gamma)

    if checkpoint:
        tqdm.write('Loading checkpoint...')
        model, optimizer = load_from_checkpoint(checkpoint,
                                                model,
                                                optimizer=optimizer)

    return model, criterion, optimizer, lr_scheduler
Пример #3
0
def set_hyperparameters(params, model, checkpoint):
    """
    Build the loss criterion, optimizer and learning-rate scheduler for a model.

    Hyperparameters come from the ``training`` section of the yaml config.
    ``learning_rate``, ``weight_decay``, ``step_size`` and ``gamma`` are
    mandatory and validated; ``class_weights`` and ``ignore_index`` are
    optional (``ignore_index`` falls back to -100).
    :param params: (dict) Parameters found in the yaml config file
    :param model: Model loaded from model_choice.py
    :param checkpoint: (dict) state dict as loaded by model_choice.py; when
                       truthy, model and optimizer are restored from it
    :return: model, criterion, optimizer, lr_scheduler
    :raises AssertionError: if a mandatory hyperparameter is None or out of range
    :raises KeyError: if a mandatory hyperparameter key is absent from the config
    """
    training_params = params['training']

    # Mandatory hyperparameters. The checks raise explicitly instead of using
    # ``assert`` so they still run under ``python -O``; the exception type is
    # kept as AssertionError for callers that already catch it.
    lr = training_params['learning_rate']
    if lr is None or not lr > 0:
        raise AssertionError("missing mandatory learning rate parameter")
    weight_decay = training_params['weight_decay']
    if weight_decay is None or not weight_decay >= 0:
        raise AssertionError("missing mandatory weight decay parameter")
    step_size = training_params['step_size']
    if step_size is None or not step_size > 0:
        raise AssertionError("missing mandatory step size parameter")
    gamma = training_params['gamma']
    if gamma is None or not gamma >= 0:
        raise AssertionError("missing mandatory gamma parameter")

    # Optional hyperparameters: a key that is absent from the config is
    # treated the same as one that is present but empty.
    raw_class_weights = training_params.get('class_weights')
    class_weights = None
    if raw_class_weights:
        class_weights = torch.tensor(raw_class_weights)
        verify_weights(params['global']['num_classes'], class_weights)
    configured_ignore_index = training_params.get('ignore_index')
    ignore_index = -100 if configured_ignore_index is None else configured_ignore_index

    # Loss function
    criterion = MultiClassCriterion(loss_type=training_params['loss_fn'],
                                    ignore_index=ignore_index,
                                    weight=class_weights)

    # Optimizer
    opt_fn = training_params['optimizer']
    optimizer = create_optimizer(params=model.parameters(),
                                 mode=opt_fn,
                                 base_lr=lr,
                                 weight_decay=weight_decay)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer,
                                             step_size=step_size,
                                             gamma=gamma)

    if checkpoint:
        model, optimizer = load_from_checkpoint(checkpoint,
                                                model,
                                                optimizer=optimizer)

    return model, criterion, optimizer, lr_scheduler
Пример #4
0
# Output and log directories are named after the config file (without suffix).
modelname = config_path.stem
output_dir = Path('../model') / modelname
# parents=True: a first run must not fail because '../model' is missing.
output_dir.mkdir(parents=True, exist_ok=True)
log_dir = Path('../logs') / modelname
log_dir.mkdir(parents=True, exist_ok=True)

logger = debug_logger(log_dir)
logger.debug(config)
logger.info(f'Device: {device}')
logger.info(f'Max Epoch: {max_epoch}')

# Loss
print('Initializing loss function, optimizer and scheduler...')
loss_fn = MultiClassCriterion(**loss_config).to(device)
params = model.parameters()
optimizer, scheduler = create_optimizer(params, **opt_config)

# history
if resume:
    # NOTE: pickle.load can execute arbitrary code; only resume from a
    # history file that was written by this training script.
    with open(log_dir.joinpath('history.pkl'), 'rb') as f:
        history_dict = pickle.load(f)
    best_metrics = history_dict['best_metrics']
    loss_history = history_dict['loss']
    iou_history = history_dict['iou']
    # One history entry per finished epoch, so its length is the next epoch.
    start_epoch = len(iou_history)
    # Fast-forward the scheduler to where the previous run stopped.
    for _ in range(start_epoch):
        scheduler.step()
else:
    start_epoch = 0
    best_metrics = 0
    loss_history = []
Пример #5
0
def main():
    """
    Train a model from the yaml configuration at ``args.config_path``.

    Reads the ``Net``, ``Data`` and ``Train`` sections of the config, builds
    the model, loss, optimizer/scheduler and data loaders, optionally resumes
    from the checkpoint at ``pretrained_path``, then trains up to
    ``num_epoch`` epochs. Every ``val_every`` epochs the validation set is
    evaluated and both the bare model weights and a resumable checkpoint are
    written to ``<saved_dir>/models/<config stem>``; plots are written to
    ``<saved_dir>/logs/<config stem>``.

    The config path is taken from the module-level ``args`` object.
    """
    config_path = Path(args.config_path)
    # safe_load with a context manager: the file handle is closed, and no
    # Loader argument is needed (a bare yaml.load() call without Loader is
    # rejected by recent PyYAML releases).
    with open(config_path) as config_file:
        config = yaml.safe_load(config_file)

    net_config = config['Net']
    data_config = config['Data']
    train_config = config['Train']

    # Config for data:
    train_dir = data_config["train_dir"]
    train_name = data_config["train_name"]
    train_type = data_config["train_type"]

    val_dir = data_config["val_dir"]
    val_name = data_config["val_name"]
    val_type = data_config["val_type"]

    target_size = data_config["target_size"]
    num_workers = data_config["num_worker"]

    # Config for train:
    num_epoch = train_config["num_epoch"]
    batch_size = train_config["batch_size"]
    val_every = train_config["val_every"]
    resume = train_config["resume"]
    pretrained_path = train_config["pretrained_path"]
    saved_dir = train_config["saved_dir"]
    epoch_start = 0
    loss_type = train_config["loss_type"]
    optimizer_config = train_config["optimizer"]

    del data_config
    del train_config

    model = load_model(**net_config)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"[INFO] Device: {device}")
    model = model.to(device)

    # Output and log directories are named after the config file.
    modelname = config_path.stem
    output_dir = Path(saved_dir) / "models" / modelname
    output_dir.mkdir(parents=True, exist_ok=True)
    log_dir = Path(saved_dir) / "logs" / modelname
    log_dir.mkdir(parents=True, exist_ok=True)

    loss_fn = Criterion(loss_type=loss_type).to(device)
    params = model.parameters()
    optimizer, scheduler = create_optimizer(params, **optimizer_config)

    # Dataset
    affine_augmenter = albu.Compose([
        albu.GaussNoise(var_limit=(0, 25), p=.2),
        albu.GaussianBlur(3, p=0.2),
        albu.JpegCompression(50, 100, p=0.2)
    ])

    image_augmenter = albu.Compose([
        albu.OneOf([
            albu.RandomBrightnessContrast(0.25, 0.25),
            albu.CLAHE(clip_limit=2),
            albu.RandomGamma(),
        ],
                   p=0.5),
        albu.HueSaturationValue(hue_shift_limit=20,
                                sat_shift_limit=30,
                                val_shift_limit=20,
                                p=0.2),
        albu.RGBShift(p=0.2),
        albu.RandomSizedCrop(min_max_height=[45, 64],
                             height=64,
                             width=64,
                             p=0.5),
    ])

    # Augmentation is applied to the training set only.
    train_dataset = load_dataset(data_type=train_type,
                                 base_dir=train_dir,
                                 filename=train_name,
                                 n_class=net_config['n_class'],
                                 target_size=target_size,
                                 affine_augmenter=affine_augmenter,
                                 image_augmenter=image_augmenter,
                                 debug=False)
    val_dataset = load_dataset(data_type=val_type,
                               base_dir=val_dir,
                               filename=val_name,
                               n_class=net_config['n_class'],
                               target_size=target_size,
                               debug=False)

    train_loader = DataLoader(train_dataset,
                              batch_size=batch_size,
                              num_workers=num_workers,
                              shuffle=True,
                              pin_memory=True,
                              drop_last=True)
    valid_loader = DataLoader(val_dataset,
                              batch_size=batch_size,
                              shuffle=False,
                              num_workers=num_workers,
                              pin_memory=True)

    # Wrap before loading/saving state so that state-dict keys are consistent
    # between the checkpoints written below and the resume path.
    if torch.cuda.is_available():
        model = nn.DataParallel(model)

    if resume:
        checkpoint = torch.load(pretrained_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        epoch_start = checkpoint['epoch'] + 1
        loss_history = checkpoint['loss_history']
        # NOTE(review): the checkpoint written below carries no scheduler
        # state, so the scheduler restarts its own epoch count on resume.
    else:
        loss_history = []

    model.train()
    for i_epoch in range(epoch_start, num_epoch):
        print(f"Epoch: {i_epoch}")
        print(f'Learning rate: {optimizer.param_groups[0]["lr"]}')
        train_losses = []
        train_diffs = []

        model.train()
        with tqdm(train_loader) as _tqdm:
            for batched in _tqdm:
                optimizer.zero_grad()

                if loss_type == "RANK":
                    # Pairwise batch: two images, their labels, and the
                    # labels consumed by the ranking loss.
                    img1, img2, lbl1, lbl2, labels = batched
                    img1, img2 = img1.to(device), img2.to(device)
                    lbl1, lbl2 = lbl1.to(device), lbl2.to(device)
                    labels = labels.to(device)

                    preds1 = model(img1)
                    preds2 = model(img2)

                    preds1 = preds1.to(device)
                    preds2 = preds2.to(device)

                    loss = loss_fn([preds1, preds2], [lbl1, lbl2, labels])

                    # Average the difference metric over both images.
                    diff = calculate_diff(preds1, lbl1)
                    diff += calculate_diff(preds2, lbl2)
                    diff /= 2

                elif loss_type in ("MSE", "wrapped"):
                    # Only the first image/label of the batch tuple is used.
                    img1, lbl1, _, _, _ = batched
                    img1, lbl1 = img1.to(device), lbl1.to(device)

                    if net_config["net_type"] == "Perceiver":
                        img1 = img1.permute(0, 2, 3, 1)

                    preds1 = model(img1)

                    loss = loss_fn([preds1, []], [lbl1, []])
                    diff = calculate_diff(preds1, lbl1)

                else:
                    # No training step is defined for other loss types; the
                    # batch is skipped.
                    continue

                _tqdm.set_postfix(
                    OrderedDict(loss=f'{loss.item():.3f}',
                                mae=f'{diff:.1f}'))
                train_losses.append(loss.item())
                history_ploter(train_losses, log_dir.joinpath('loss.png'))
                train_diffs.append(diff)

                loss.backward()
                optimizer.step()

        train_loss = np.mean(train_losses)
        train_diff = np.nanmean(train_diffs)

        print(f'[INFO] train loss: {train_loss}')
        print(f'[INFO] train diff: {train_diff}')

        scheduler.step()

        if (i_epoch + 1) % val_every == 0:
            valid_diffs = []
            model.eval()
            with torch.no_grad():
                with tqdm(valid_loader) as _tqdm:
                    for batched in _tqdm:
                        images, labels, _, _, _ = batched

                        if net_config["net_type"] == "Perceiver":
                            images = images.permute(0, 2, 3, 1)

                        images, labels = images.to(device), labels.to(device)

                        preds = model(images)

                        # Only the difference metric is tracked during
                        # validation; no loss is computed here.
                        diff = calculate_diff(preds, labels)

                        _tqdm.set_postfix(OrderedDict(mae=f'{diff:.2f}'))

                        valid_diffs.append(diff)

            valid_diff = np.mean(valid_diffs)
            loss_history.append([train_diff, valid_diff])
            history_ploter(loss_history, log_dir.joinpath('diff.png'))
            print(f'[INFO] valid diff: {valid_diff}')

            # Bare weights, plus a full checkpoint that the resume path reads.
            torch.save(
                model.state_dict(),
                output_dir.joinpath(f'model_epoch_{i_epoch}_{valid_diff}.pth'))
            torch.save(
                {
                    'epoch': i_epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss_history': loss_history,
                },
                output_dir.joinpath(
                    f'checkpoint_epoch_{i_epoch}_{valid_diff}.pth'))
Пример #6
0
def _find_checkpoint(output_dir, prefix, epoch):
    """
    Locate the ``<prefix>_epoch_<epoch>`` checkpoint file inside ``output_dir``.

    Checkpoints are saved with the validation metric appended to the name
    (``<prefix>_epoch_<epoch>_<metric>.pth``), so the plain name is tried
    first and the suffixed form second. When nothing matches, the plain path
    is returned so the caller's ``torch.load`` reports the missing file.
    """
    exact = output_dir.joinpath(f'{prefix}_epoch_{epoch}.pth')
    if exact.exists():
        return exact
    candidates = sorted(output_dir.glob(f'{prefix}_epoch_{epoch}_*.pth'))
    return candidates[-1] if candidates else exact


def main():
    """
    Train a model from the yaml configuration at ``args.config_path``.

    Reads the ``Net``, ``Data``, ``Train``, ``Loss`` and ``Optimizer``
    sections of the config, builds the model, loss, optimizer/scheduler and
    data loaders, optionally loads pretrained weights and/or resumes a
    previous run, then trains up to ``max_epoch`` epochs. Every
    ``test_every`` epochs the validation set is evaluated and model and
    optimizer state are saved to ``../model/<config stem>``. Plots and the
    pickled history go to ``../logs/<config stem>``; per-epoch numbers are
    appended to ``file_train_log.txt`` / ``file_val_log.txt`` in the working
    directory.

    The config path is taken from the module-level ``args`` object.
    """
    config_path = Path(args.config_path)
    # safe_load with a context manager: the file handle is closed, and no
    # Loader argument is needed (a bare yaml.load() call without Loader is
    # rejected by recent PyYAML releases).
    with open(config_path) as config_file:
        config = yaml.safe_load(config_file)

    net_config = config['Net']
    data_config = config['Data']
    train_config = config['Train']
    loss_config = config['Loss']
    opt_config = config['Optimizer']
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    n_class = net_config['n_class']
    max_epoch = train_config['max_epoch']
    batch_size = train_config['batch_size']
    num_workers = train_config['num_workers']
    test_every = train_config['test_every']
    resume = train_config['resume']
    pretrained_path = train_config['pretrained_path']
    # These two flags are removed from the config section after being read.
    use_rank = train_config.pop('use_rank')
    use_bined = train_config.pop('use_bined')

    # Keys passed explicitly to the dataset loader are removed so that the
    # remainder of data_config can be forwarded with ** below.
    train_dir = data_config.pop('train_dir')
    val_dir = data_config.pop('val_dir')
    train_name = data_config.pop('train_name')
    val_name = data_config.pop('val_name')
    train_type = data_config.pop('train_type')
    val_type = data_config.pop('val_type')

    model = load_model(**net_config)

    # To device
    model = model.to(device)

    modelname = config_path.stem
    output_dir = Path('../model') / modelname
    # parents=True: a first run must not fail because '../model' is missing.
    output_dir.mkdir(parents=True, exist_ok=True)
    log_dir = Path('../logs') / modelname
    log_dir.mkdir(parents=True, exist_ok=True)

    logger = debug_logger(log_dir)
    logger.debug(config)
    logger.info(f'Device: {device}')
    logger.info(f'Max Epoch: {max_epoch}')

    loss_fn = Criterion(**loss_config).to(device)
    params = model.parameters()
    optimizer, scheduler = create_optimizer(params, **opt_config)

    # history
    if resume:
        # NOTE: pickle.load can execute arbitrary code; only resume from a
        # history file that was written by this training script.
        with open(log_dir.joinpath('history.pkl'), 'rb') as f:
            history_dict = pickle.load(f)
        best_metrics = history_dict['best_metrics']
        loss_history = history_dict['loss']
        diff_history = history_dict['diff']
        # One history entry is appended per finished epoch, so the length is
        # the index of the next epoch to run.
        start_epoch = len(diff_history)
        # Fast-forward the scheduler to where the previous run stopped.
        for _ in range(start_epoch):
            scheduler.step()
    else:
        start_epoch = 0
        best_metrics = float('inf')
        loss_history = []
        diff_history = []

    # Dataset
    affine_augmenter = albu.Compose([albu.GaussNoise(var_limit=(0, 25), p=.2),
                                     albu.GaussianBlur(3, p=0.2),
                                     albu.JpegCompression(50, 100, p=0.2)])

    image_augmenter = albu.Compose([
        albu.OneOf([
            albu.RandomBrightnessContrast(0.25, 0.25),
            albu.CLAHE(clip_limit=2),
            albu.RandomGamma(),
        ], p=0.5),
        albu.HueSaturationValue(hue_shift_limit=20, sat_shift_limit=30, val_shift_limit=20, p=0.2),
        albu.RGBShift(p=0.2),
    ])
    # NOTE(review): 'laod_dataset' looks like a misspelling of 'load_dataset';
    # the name is left unchanged because its definition is not visible here.
    train_dataset = laod_dataset(data_type=train_type, affine_augmenter=affine_augmenter, image_augmenter=image_augmenter,
                                 base_dir=train_dir, filename=train_name, use_bined=use_bined, n_class=n_class, **data_config)

    valid_dataset = laod_dataset(data_type=val_type, split='valid', base_dir=val_dir, filename=val_name,
                                 use_bined=use_bined, n_class=n_class, **data_config)

    # NOTE(review): the training loader neither shuffles nor uses a sampler;
    # confirm that iterating in dataset order is intended.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset, batch_size=1, shuffle=False, num_workers=num_workers, pin_memory=True)

    if torch.cuda.is_available():
        model = nn.DataParallel(model)

    # Pretrained model
    if pretrained_path:
        logger.info(f'Load pretrained from {pretrained_path}')
        param = torch.load(pretrained_path, map_location='cpu')
        if "state_dict" in param:
            model.load_state_dict(param['state_dict'], strict=False)
        else:
            model.load_state_dict(param)
        del param

    # Restore model
    if resume:
        print("[INFO] resume training.")
        # Saved file names carry the validation metric as a suffix, so the
        # checkpoint of the last finished epoch is looked up by prefix.
        model_path = _find_checkpoint(output_dir, 'model', start_epoch - 1)
        logger.info(f'Resume from {model_path}')
        param = torch.load(model_path, map_location='cpu')
        model.load_state_dict(param)
        del param
        opt_path = _find_checkpoint(output_dir, 'opt', start_epoch - 1)
        param = torch.load(opt_path)
        optimizer.load_state_dict(param)
        del param

    # The context manager closes both log files even if training raises.
    with open("file_train_log.txt", "a") as file_train_log, \
            open("file_val_log.txt", "a") as file_val_log:
        # Train
        for i_epoch in range(start_epoch, max_epoch):
            logger.info(f'Epoch: {i_epoch}')
            logger.info(f'Learning rate: {optimizer.param_groups[0]["lr"]}')

            train_losses = []
            train_diffs = []
            model.train()
            with tqdm(train_loader) as _tqdm:
                for batched in _tqdm:
                    optimizer.zero_grad()

                    if use_rank:
                        if use_bined:
                            # Pairwise batch with per-angle labels for both images.
                            img1, img2, lbl1, lbl2, labels, yaw_lbl1, pitch_lbl1, roll_lbl1, yaw_lbl2, pitch_lbl2, roll_lbl2 = batched
                            img1, img2, lbl1, lbl2, labels = img1.to(device), img2.to(device), lbl1.to(device), lbl2.to(device), labels.to(device)
                            yaw_lbl1, pitch_lbl1, roll_lbl1 = yaw_lbl1.to(device), pitch_lbl1.to(device), roll_lbl1.to(device)
                            yaw_lbl2, pitch_lbl2, roll_lbl2 = yaw_lbl2.to(device), pitch_lbl2.to(device), roll_lbl2.to(device)

                            preds1, y_pres1, p_pres1, r_pres1 = model(img1, True)
                            preds2, y_pres2, p_pres2, r_pres2 = model(img2, True)

                            pre_list = [preds1, preds2, y_pres1, p_pres1, r_pres1, y_pres2, p_pres2, r_pres2]
                            lbl_list = [lbl1, lbl2, yaw_lbl1, pitch_lbl1, roll_lbl1, yaw_lbl2, pitch_lbl2, roll_lbl2, labels]
                            loss = loss_fn(pre_list, lbl_list, use_bined=True)
                        else:
                            img1, img2, lbl1, lbl2, labels = batched
                            img1, img2, lbl1, lbl2, labels = img1.to(device), img2.to(device), lbl1.to(device), lbl2.to(device), labels.to(device)

                            preds1 = model(img1, False)
                            preds2 = model(img2, False)

                            loss = loss_fn([preds1, preds2], [lbl1, lbl2, labels], use_bined=False)

                        # Average the difference metric over both images.
                        diff = calculate_diff(preds1, lbl1)
                        diff += calculate_diff(preds2, lbl2)
                        diff /= 2

                    elif use_bined:
                        images, labels, yaw_labels, pitch_labels, roll_labels = batched

                        images, labels = images.to(device), labels.to(device)
                        yaw_labels, pitch_labels, roll_labels = yaw_labels.to(device), pitch_labels.to(device), roll_labels.to(device)

                        preds, y_pres, p_pres, r_pres = model(images, use_bined)

                        loss = loss_fn([preds, y_pres, p_pres, r_pres], [labels, yaw_labels, pitch_labels, roll_labels], use_bined)

                        diff = calculate_diff(preds, labels)
                    else:
                        images, labels = batched

                        images, labels = images.to(device), labels.to(device)

                        preds = model(images, use_bined)

                        loss = loss_fn([preds], [labels])

                        diff = calculate_diff(preds, labels, mean=True)

                    _tqdm.set_postfix(OrderedDict(loss=f'{loss.item():.3f}', mae=f'{diff:.1f}'))
                    train_losses.append(loss.item())
                    train_diffs.append(diff)

                    loss.backward()
                    optimizer.step()

            scheduler.step()

            train_loss = np.mean(train_losses)
            train_diff = np.nanmean(train_diffs)
            logger.info(f'train loss: {train_loss}')
            logger.info(f'train diff: {train_diff}')
            # One record per line; without the newline the records of
            # successive epochs run together and cannot be parsed.
            file_train_log.write(f"{train_loss},{train_diff}\n")

            if (i_epoch + 1) % test_every == 0:
                valid_losses = []
                valid_diffs = []
                model.eval()
                with torch.no_grad():
                    with tqdm(valid_loader) as _tqdm:
                        for batched in _tqdm:
                            if use_bined:
                                images, labels, yaw_labels, pitch_labels, roll_labels = batched

                                images, labels = images.to(device), labels.to(device)

                                preds, y_pres, p_pres, r_pres = model(images, use_bined)

                                diff = calculate_diff(preds, labels)
                            else:
                                images, labels = batched

                                images, labels = images.to(device), labels.to(device)

                                preds = model(images, use_bined)

                                diff = calculate_diff(preds, labels)

                            _tqdm.set_postfix(OrderedDict(mae=f'{diff:.2f}'))
                            # No loss is computed during validation; 0 is
                            # recorded as a placeholder so the history keeps
                            # its shape.
                            valid_losses.append(0)
                            valid_diffs.append(diff)

                valid_loss = np.mean(valid_losses)
                valid_diff = np.mean(valid_diffs)
                logger.info(f'valid seg loss: {valid_loss}')
                logger.info(f'valid diff: {valid_diff}')
                file_val_log.write(f"{valid_loss},{valid_diff}\n")

                if best_metrics >= valid_diff:
                    best_metrics = valid_diff
                    logger.info('Best Model!\n')

                # Saved after every validation run; the best model uses the
                # same file names, so it needs no separate save.
                torch.save(model.state_dict(), output_dir.joinpath(f'model_epoch_{i_epoch}_{valid_diff}.pth'))
                torch.save(optimizer.state_dict(), output_dir.joinpath(f'opt_epoch_{i_epoch}_{valid_diff}.pth'))

            else:
                valid_loss = None
                valid_diff = None

            loss_history.append([train_loss, valid_loss])
            diff_history.append([train_diff, valid_diff])
            history_ploter(loss_history, log_dir.joinpath('loss.png'))
            history_ploter(diff_history, log_dir.joinpath('diff.png'))

            # Rewritten every epoch so that an interrupted run can resume.
            history_dict = {'loss': loss_history,
                            'diff': diff_history,
                            'best_metrics': best_metrics}
            with open(log_dir.joinpath('history.pkl'), 'wb') as f:
                pickle.dump(history_dict, f)
Пример #7
0
    output_dir.mkdir(exist_ok=True, parents=True)
    log_dir = Path('../logs').joinpath(modelname)
    log_dir.mkdir(exist_ok=True, parents=True)

    logger = debug_logger(log_dir)
    logger.info(f'Device: {device}')
    logger.info(f'Max Epoch: {max_epoch}')

    del data_config['dataset']
    train_dataset = Dataset(split='train', **data_config)
    valid_dataset = Dataset(split='valid', **data_config)
    train_loader = DataLoader(train_dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=4,
                              pin_memory=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=batch_size,
                              shuffle=False,
                              num_workers=4,
                              pin_memory=True)

    if 'unet' in net_config['dec_type']:
        model = EncoderDecoderNet(**net_config).to(device)
    else:
        model = SPPNet(**net_config).to(device)
    loss_fn = CrossEntropy2d(**loss_config).to(device)
    optimizer, scheduler = create_optimizer(model=model, **opt_config)

    train()
Пример #8
0
def set_hyperparameters(params, model, state_dict_path):
    """
    Function to set hyperparameters based on values provided in yaml config file.
    Will also set model to GPU, if available.
    If none provided, default functions values are used.
    :param params: (dict) Parameters found in the yaml config file
    :param model: Model loaded from model_choice.py
    :param state_dict_path: (str) Full file path to the state dict; an empty string skips checkpoint loading
    :return: model, criterion, optimizer, lr_scheduler, device, num_devices
    """

    # assign default values to hyperparameters, taken from the signatures of
    # the torch classes so they follow whatever torch version is installed
    loss_signature = inspect.signature(nn.CrossEntropyLoss).parameters
    optim_signature = inspect.signature(optim.Adam).parameters
    lr_scheduler_signature = inspect.signature(
        optim.lr_scheduler.StepLR).parameters
    class_weights = loss_signature['weight'].default
    ignore_index = loss_signature['ignore_index'].default
    lr = optim_signature['lr'].default
    weight_decay = optim_signature['weight_decay'].default
    step_size = lr_scheduler_signature['step_size'].default
    if not isinstance(step_size, int):
        # no usable default in the signature: pick a step larger than the
        # number of epochs so the learning rate is never decayed
        step_size = params['training']['num_epochs'] + 1
    gamma = lr_scheduler_signature['gamma'].default
    num_devices = 0

    # replace default values by those in config file if they exist
    training_cfg = params['training']
    if training_cfg['class_weights']:
        class_weights = torch.tensor(training_cfg['class_weights'])
        verify_weights(params['global']['num_classes'], class_weights)
    # 0 is a legitimate label value to ignore, so test against None instead
    # of relying on truthiness (which would silently drop a configured 0)
    if training_cfg['ignore_index'] is not None:
        ignore_index = training_cfg['ignore_index']
    if training_cfg['learning_rate']:
        lr = training_cfg['learning_rate']
    if training_cfg['weight_decay']:
        weight_decay = training_cfg['weight_decay']
    if training_cfg['step_size']:
        step_size = training_cfg['step_size']
    if training_cfg['gamma']:
        gamma = training_cfg['gamma']
    if params['global']['num_gpus']:
        num_devices = params['global']['num_gpus']

    # Loss function
    criterion = MultiClassCriterion(loss_type=training_cfg['loss_fn'],
                                    ignore_index=ignore_index,
                                    weight=class_weights)

    # list of GPU devices that are available and unused. If no GPUs, returns empty list
    lst_device_ids = get_device_ids(
        num_devices) if torch.cuda.is_available() else []
    num_devices = len(lst_device_ids) if lst_device_ids else 0
    # the list is empty whenever CUDA is unavailable, so it alone decides the device
    device = torch.device(
        f'cuda:{lst_device_ids[0]}' if lst_device_ids else 'cpu')

    if num_devices == 1:
        print(f"Using Cuda device {lst_device_ids[0]}")
    elif num_devices > 1:
        print(f"Using data parallel on devices {str(lst_device_ids)[1:-1]}")
        # adds prefix 'module.' to state_dict keys
        model = nn.DataParallel(model, device_ids=lst_device_ids)
    else:
        warnings.warn(
            "No Cuda device available. This process will only run on CPU")

    criterion = criterion.to(device)
    model = model.to(device)

    # Optimizer
    opt_fn = training_cfg['optimizer']
    optimizer = create_optimizer(params=model.parameters(),
                                 mode=opt_fn,
                                 base_lr=lr,
                                 weight_decay=weight_decay)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer,
                                             step_size=step_size,
                                             gamma=gamma)

    # restore model/optimizer state only when a checkpoint path was given
    if state_dict_path != '':
        model, optimizer = load_from_checkpoint(state_dict_path,
                                                model,
                                                optimizer=optimizer)

    return model, criterion, optimizer, lr_scheduler, device, num_devices
Пример #9
0
    # load data
    # NOTE(review): the test loader is shuffled as well (shuffle=True) - confirm
    # this is intended for evaluation.
    data_loader = DataLoader(dataset,
                             batch_size=opts.batch_size,
                             shuffle=True,
                             num_workers=opts.num_workers,
                             drop_last=False)
    test_data_loader = DataLoader(test_dataset,
                                  batch_size=opts.batch_size,
                                  shuffle=True,
                                  num_workers=opts.num_workers,
                                  drop_last=False)

    # initialize criterion and optimizer
    # create_optimizer returns the optimizer together with two schedulers.
    criterion = nn.CrossEntropyLoss()
    optimizer, linear_scheduler, constant_scheduler = create_optimizer(
        net, opts, len(dataset))

    # Empty table with one column per tracked quantity.
    df = pd.DataFrame(columns=[
        'epoch', 'train_loss', 'train_acc', 'val_loss', 'val_acc', 'f1_neg',
        'train_time', 'val_time'
    ])
    logger.info('Start training...')
    best_checkpoint = 0.0

    # train
    for epoch in range(opts.epochs):
        t0 = time.time()
        # switch to a 1-based epoch number for the rest of the iteration
        epoch = epoch + 1
        total = 0
        total_loss = 0.0
        train_preds = None
Пример #10
0
def _create_stepped_optimizer(params, opt_config, n_steps):
    """Build (optimizer, scheduler) with create_optimizer and advance the scheduler n_steps times."""
    optimizer, scheduler = create_optimizer(params, **opt_config)
    for _ in range(n_steps):
        scheduler.step()
    return optimizer, scheduler


def process(config_path, dataset_dir=None):
    """
    Train a segmentation network as described by a yaml configuration file.

    The config must provide the sections 'Net', 'Data', 'Train', 'Loss' and
    'Optimizer'. Every epoch rebuilds the train/valid datasets and loaders,
    trains, validates with ``model.tta``, writes checkpoints to the run's
    output directory and appends to the loss/IoU history in the log directory.

    Relies on names defined outside this function (``ROOT_DIR``, ``plotter``,
    ``debug_logger``, ``create_optimizer``, ``MultiClassCriterion``, ...).

    :param config_path: (pathlib.Path) yaml config file; its stem names the run
    :param dataset_dir: (str) base directory handed to the dataset class;
        defaults to the path that used to be hard-coded here
    :raises NotImplementedError: if Data.dataset is not 'deepglobe-dynamic'
    """
    gc.collect()
    torch.cuda.empty_cache()
    # Close the file deterministically and use the safe loader: recent PyYAML
    # releases reject yaml.load() without an explicit Loader.
    with open(config_path) as config_file:
        config = yaml.safe_load(config_file)
    net_config = config['Net']
    data_config = config['Data']
    train_config = config['Train']
    loss_config = config['Loss']
    opt_config = config['Optimizer']
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # Not used below; kept so a config without 't_max' still fails right here.
    t_max = opt_config['t_max']

    # Collect training parameters
    max_epoch = train_config['max_epoch']
    batch_size = train_config['batch_size']
    fp16 = train_config['fp16']
    resume = train_config['resume']
    pretrained_path = train_config['pretrained_path']
    freeze_enabled = train_config['freeze']
    seed_enabled = train_config['seed']

    # Deterministic training: seed every random number generator in use
    if seed_enabled:
        seed = 100
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        np.random.seed(seed=seed)
        import random
        random.seed(a=seed)

    # Network
    if 'unet' in net_config['dec_type']:
        net_type = 'unet'
        model = EncoderDecoderNet(**net_config)
    else:
        net_type = 'deeplab'
        # built with 19 outputs; the head is swapped below when pretrained
        # weights are loaded
        net_config['output_channels'] = 19
        model = SPPNet(**net_config)

    dataset = data_config['dataset']
    if dataset == 'deepglobe-dynamic':
        from dataset.deepglobe_dynamic import DeepGlobeDatasetDynamic as Dataset
        net_config['output_channels'] = 7
        classes = np.arange(0, 7)
    else:
        raise NotImplementedError
    # 'dataset' must not be forwarded to Dataset() as a keyword argument
    del data_config['dataset']

    modelname = config_path.stem
    timestamp = datetime.timestamp(datetime.now())
    run_time = datetime.fromtimestamp(timestamp)
    print("timestamp =", run_time)
    # NOTE(review): both directories embed the current time, so every run gets
    # fresh ones; with resume=True the history/checkpoint files read below are
    # therefore looked up in a directory that was just created - confirm how
    # resuming is supposed to locate the previous run.
    output_dir = Path(os.path.join(ROOT_DIR, f'model/{modelname}_{run_time}'))
    output_dir.mkdir(exist_ok=True, parents=True)
    log_dir = Path(os.path.join(ROOT_DIR, f'logs/{modelname}_{run_time}'))
    log_dir.mkdir(exist_ok=True, parents=True)
    if dataset_dir is None:
        dataset_dir = '/home/sfoucher/DEV/pytorch-segmentation/data/deepglobe_as_pascalvoc/VOCdevkit/VOC2012'
    logger = debug_logger(log_dir)
    logger.debug(config)
    logger.info(f'Device: {device}')
    logger.info(f'Max Epoch: {max_epoch}')

    # Loss
    loss_fn = MultiClassCriterion(**loss_config).to(device)

    # history
    if resume:
        with open(log_dir.joinpath('history.pkl'), 'rb') as f:
            history_dict = pickle.load(f)
        best_metrics = history_dict['best_metrics']
        loss_history = history_dict['loss']
        iou_history = history_dict['iou']
        start_epoch = len(iou_history)
    else:
        start_epoch = 0
        best_metrics = 0
        loss_history = []
        iou_history = []

    affine_augmenter = albu.Compose([albu.HorizontalFlip(p=.5),
                                     albu.VerticalFlip(p=.5)])
    image_augmenter = None

    # Datasets and loaders are built inside the epoch loop (dynamic training).

    # Pretrained model
    if pretrained_path:
        logger.info(f'Resume from {pretrained_path}')
        param = torch.load(pretrained_path)
        model.load_state_dict(param)
        # replace the classification head to match the dataset's class count
        model.logits = torch.nn.Conv2d(256, net_config['output_channels'], 1)
        del param

    # To device
    model = model.to(device)

    # Build the optimizer only now, after the head may have been replaced:
    # built earlier it would keep the discarded head's parameters and never
    # update the new one. When resuming, the scheduler is advanced to the
    # epoch the history stopped at.
    optimizer, scheduler = _create_stepped_optimizer(
        model.parameters(), opt_config, start_epoch)

    if freeze_enabled:
        # Freeze the first half of the first parameter group ...
        group_params = optimizer.param_groups[0]['params']
        for frozen in group_params[:int(len(group_params) * 0.5)]:
            frozen.requires_grad = False
        # ... and rebuild the optimizer over the still-trainable parameters.
        print("Params to learn:")
        params_to_update = []
        for name, param in model.named_parameters():
            if param.requires_grad:
                params_to_update.append(param)
                print("\t", name)
        optimizer, scheduler = _create_stepped_optimizer(
            params_to_update, opt_config, start_epoch)

    # fp16
    if fp16:
        # Only the needed apex files are vendored: the C backend of apex is
        # not required here and could not be installed.
        from utils.apex.apex.fp16_utils.fp16util import BN_convert_float
        from utils.apex.apex.fp16_utils.fp16_optimizer import FP16_Optimizer
        model = BN_convert_float(model.half())
        optimizer = FP16_Optimizer(optimizer, verbose=False, dynamic_loss_scale=True)
        logger.info('Apply fp16')

    # Restore model
    if resume:
        model_path = output_dir.joinpath('model_tmp.pth')
        logger.info(f'Resume from {model_path}')
        param = torch.load(model_path)
        model.load_state_dict(param)
        del param
        opt_path = output_dir.joinpath('opt_tmp.pth')
        param = torch.load(opt_path)
        optimizer.load_state_dict(param)
        del param
    i_iter = 0
    # running averages of the per-iteration loss / IoU, used for plotting only
    ma_loss = 0
    ma_iou = 0
    # Train
    for i_epoch in range(start_epoch, max_epoch):
        logger.info(f'Epoch: {i_epoch}')
        logger.info(f'Learning rate: {optimizer.param_groups[0]["lr"]}')

        train_losses = []
        train_ious = []
        model.train()

        # Initialize randomized but balanced datasets
        train_dataset = Dataset(base_dir=dataset_dir,
                                affine_augmenter=affine_augmenter, image_augmenter=image_augmenter,
                                net_type=net_type, **data_config)
        valid_dataset = Dataset(base_dir=dataset_dir,
                                split='valid', net_type=net_type, **data_config)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4,
                                  pin_memory=True, drop_last=True)
        valid_loader = DataLoader(valid_dataset, batch_size=1, shuffle=False, num_workers=4, pin_memory=True)

        with tqdm(train_loader) as _tqdm:
            for images, labels in _tqdm:
                if fp16:
                    images = images.half()
                images, labels = images.to(device), labels.to(device)
                optimizer.zero_grad()
                preds = model(images)
                if net_type == 'deeplab':
                    # bring the prediction back to the label resolution
                    preds = F.interpolate(preds, size=labels.shape[1:], mode='bilinear', align_corners=True)
                if fp16:
                    # the loss is computed in float32
                    loss = loss_fn(preds.float(), labels)
                else:
                    loss = loss_fn(preds, labels)

                preds_np = preds.detach().cpu().numpy()
                labels_np = labels.detach().cpu().numpy()
                iou = compute_iou_batch(np.argmax(preds_np, axis=1), labels_np, classes)

                _tqdm.set_postfix(OrderedDict(seg_loss=f'{loss.item():.5f}', iou=f'{iou:.3f}'))
                train_losses.append(loss.item())
                train_ious.append(iou)
                ma_loss = 0.01 * loss.item() + 0.99 * ma_loss
                ma_iou = 0.01 * iou + 0.99 * ma_iou
                plotter.plot('loss', 'train', 'iteration Loss', i_iter, loss.item())
                plotter.plot('iou', 'train', 'iteration iou', i_iter, iou)
                plotter.plot('loss', 'ma_loss', 'iteration Loss', i_iter, ma_loss)
                plotter.plot('iou', 'ma_iou', 'iteration iou', i_iter, ma_iou)
                if fp16:
                    # the FP16_Optimizer wrapper drives the backward pass itself
                    optimizer.backward(loss)
                else:
                    loss.backward()
                optimizer.step()
                i_iter += 1
        scheduler.step()

        train_loss = np.mean(train_losses)
        # nanmean: NaN IoU values from individual batches are left out
        train_iou = np.nanmean(train_ious)
        logger.info(f'train loss: {train_loss}')
        logger.info(f'train iou: {train_iou}')
        plotter.plot('loss-epoch', 'train', 'iteration Loss', i_epoch, train_loss)
        plotter.plot('iou-epoch', 'train', 'iteration iou', i_epoch, train_iou)
        # rolling checkpoint, overwritten every epoch (read back when resuming)
        torch.save(model.state_dict(), output_dir.joinpath('model_tmp.pth'))
        torch.save(optimizer.state_dict(), output_dir.joinpath('opt_tmp.pth'))

        valid_losses = []
        valid_ious = []
        model.eval()
        with torch.no_grad():
            with tqdm(valid_loader) as _tqdm:
                for images, labels in _tqdm:
                    if fp16:
                        images = images.half()
                    images, labels = images.to(device), labels.to(device)
                    preds = model.tta(images, net_type=net_type)
                    if fp16:
                        loss = loss_fn(preds.float(), labels)
                    else:
                        loss = loss_fn(preds, labels)

                    preds_np = preds.detach().cpu().numpy()
                    labels_np = labels.detach().cpu().numpy()

                    # I changed a parameter in the compute_iou method to prevent it from yielding nans
                    iou = compute_iou_batch(np.argmax(preds_np, axis=1), labels_np, classes)

                    _tqdm.set_postfix(OrderedDict(seg_loss=f'{loss.item():.5f}', iou=f'{iou:.3f}'))
                    valid_losses.append(loss.item())
                    valid_ious.append(iou)

        valid_loss = np.mean(valid_losses)
        valid_iou = np.mean(valid_ious)
        logger.info(f'valid seg loss: {valid_loss}')
        logger.info(f'valid iou: {valid_iou}')
        plotter.plot('loss-epoch', 'valid', 'iteration Loss', i_epoch, valid_loss)
        plotter.plot('iou-epoch', 'valid', 'iteration iou', i_epoch, valid_iou)
        # keep a separate copy of the best model according to validation IoU
        if best_metrics < valid_iou:
            best_metrics = valid_iou
            logger.info('Best Model!')
            torch.save(model.state_dict(), output_dir.joinpath('model.pth'))
            torch.save(optimizer.state_dict(), output_dir.joinpath('opt.pth'))

        loss_history.append([train_loss, valid_loss])
        iou_history.append([train_iou, valid_iou])
        history_ploter(loss_history, log_dir.joinpath('loss.png'))
        history_ploter(iou_history, log_dir.joinpath('iou.png'))

        # persist the history after every epoch
        history_dict = {'loss': loss_history,
                        'iou': iou_history,
                        'best_metrics': best_metrics}
        with open(log_dir.joinpath('history.pkl'), 'wb') as f:
            pickle.dump(history_dict, f)