import os
import logging

import numpy as np
import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

# BasicDataset, eval_net, dice_loss and convert_result_to_csv are project-local
# helpers; their import paths are not shown in this file (hedged sketches of
# the latter three are included below for reference).


# Evaluation-only variant: loads a saved checkpoint and scores the test set.
def train_net(net,
              device,
              epochs=250,
              batch_size=4,
              lr=0.0001,
              save_cp=True,
              args=None,
              input_path=None,
              test=False):  # flag for the evaluation-only path below
    # assign image paths
    dir_img = input_path.dir_img
    dir_mask = input_path.dir_mask
    dir_valimg = input_path.dir_valimg
    dir_valmask = input_path.dir_valmask
    dir_testimg = input_path.dir_testimg
    dir_testmask = input_path.dir_testmask
    dir_externaltestimg = input_path.dir_externaltestimg
    dir_externaltestmask = input_path.dir_externaltestmask

    # assign experimental options
    exp_name = args.expname
    img_scale = args.scale
    color_map = args.colormap
    dir_checkpoint = os.path.join(input_path.dir_checkpoint, exp_name)

    dataset = BasicDataset(dir_img, dir_mask, img_scale, color_map, 'train')
    dataval = BasicDataset(dir_valimg, dir_valmask, img_scale, color_map, 'val')
    datatest = BasicDataset(dir_testimg, dir_testmask, img_scale, color_map, 'test')
    dataexternaltest = BasicDataset(dir_externaltestimg, dir_externaltestmask,
                                    img_scale, color_map, 'test')

    n_val = len(dataval)
    n_train = len(dataset)

    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                              num_workers=4, pin_memory=True, drop_last=False)
    val_loader = DataLoader(dataval, batch_size=batch_size, shuffle=False,
                            num_workers=4, pin_memory=True, drop_last=False)
    test_loader = DataLoader(datatest, batch_size=batch_size, shuffle=False,
                             num_workers=4, pin_memory=True, drop_last=False)
    external_test_loader = DataLoader(dataexternaltest, batch_size=batch_size,
                                      shuffle=False, num_workers=4,
                                      pin_memory=True, drop_last=False)

    writer = SummaryWriter(comment=f'_EXPNAME_{exp_name}')
    global_step = 0

    logging.info(f'''Starting training:
        Epochs:          {epochs}
        Batch size:      {batch_size}
        Learning rate:   {lr}
        Training size:   {n_train}
        Validation size: {n_val}
        Checkpoints:     {save_cp}
        Device:          {device.type}
        Images scaling:  {img_scale}
        Color map:       {color_map}
    ''')

    # optimizer and criterion are set up but unused in this evaluation-only variant
    optimizer = optim.Adam(net.parameters(), lr=lr, betas=(0.9, 0.999))
    criterion = nn.CrossEntropyLoss()

    # restore a saved checkpoint and score the internal test set
    net.load_state_dict(
        torch.load("checkpoints/UNet_exp0CP_epoch10.pth", map_location=device))
    net.eval()
    test_score = eval_net(net, test_loader, device, "Output/", True)
    print(test_score)
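
# eval_net is used above but not defined in this file. A minimal sketch is
# given below, assuming it returns the mean Dice coefficient over a loader
# and, when the optional `save_dir`/`save` arguments are passed (as in the
# call above), dumps the predicted masks to disk. The signature, the
# binary-foreground Dice, and the .npy output format are assumptions for
# illustration, not the project's actual implementation.
def eval_net(net, loader, device, save_dir=None, save=False):
    """Return the mean Dice score of `net` over `loader` (sketch)."""
    net.eval()
    tot, n_batches, eps = 0.0, 0, 1e-6
    with torch.no_grad():
        for i, batch in enumerate(loader):
            imgs = batch['image'].to(device=device, dtype=torch.float32)
            true_masks = batch['mask'].to(device=device, dtype=torch.long)
            pred = net(imgs).argmax(dim=1)  # (N, H, W) hard labels
            inter = (pred * true_masks).sum().float()
            union = pred.sum().float() + true_masks.sum().float()
            tot += ((2 * inter + eps) / (union + eps)).item()
            n_batches += 1
            if save and save_dir is not None:
                # hypothetical dump of the predicted masks
                np.save(os.path.join(save_dir, f'pred_{i}.npy'),
                        pred.cpu().numpy())
    net.train()
    return tot / max(n_batches, 1)
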
# Full training variant: trains with a combined cross-entropy + Dice loss and
# evaluates on validation, internal test, and external test splits each epoch.
def train_net(net,
              device,
              epochs=250,
              batch_size=4,
              lr=0.0001,
              save_cp=True,
              args=None,
              input_path=None):
    # assign image paths
    dir_img = input_path.dir_img
    dir_mask = input_path.dir_mask
    dir_valimg = input_path.dir_valimg
    dir_valmask = input_path.dir_valmask
    dir_testimg = input_path.dir_testimg
    dir_testmask = input_path.dir_testmask
    dir_externaltestimg = input_path.dir_externaltestimg
    dir_externaltestmask = input_path.dir_externaltestmask

    # assign experimental options
    exp_name = args.expname
    img_scale = args.scale
    color_map = args.colormap
    dir_checkpoint = os.path.join(input_path.dir_checkpoint, exp_name)

    dataset = BasicDataset(dir_img, dir_mask, img_scale, color_map, 'train')
    dataval = BasicDataset(dir_valimg, dir_valmask, img_scale, color_map, 'val')
    datatest = BasicDataset(dir_testimg, dir_testmask, img_scale, color_map, 'test')
    dataexternaltest = BasicDataset(dir_externaltestimg, dir_externaltestmask,
                                    img_scale, color_map, 'test')

    # yuankai: derive the split sizes automatically from the datasets
    n_val = len(dataval)
    n_train = len(dataset)

    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                              num_workers=4, pin_memory=True)
    # note: drop_last=True discards the final partial batch of each eval split
    val_loader = DataLoader(dataval, batch_size=batch_size, shuffle=False,
                            num_workers=4, pin_memory=True, drop_last=True)
    test_loader = DataLoader(datatest, batch_size=batch_size, shuffle=False,
                             num_workers=4, pin_memory=True, drop_last=True)
    external_test_loader = DataLoader(dataexternaltest, batch_size=batch_size,
                                      shuffle=False, num_workers=4,
                                      pin_memory=True, drop_last=True)

    writer = SummaryWriter(comment=f'_EXPNAME_{exp_name}')
    global_step = 0

    logging.info(f'''Starting training:
        Epochs:          {epochs}
        Batch size:      {batch_size}
        Learning rate:   {lr}
        Training size:   {n_train}
        Validation size: {n_val}
        Checkpoints:     {save_cp}
        Device:          {device.type}
        Images scaling:  {img_scale}
        Color map:       {color_map}
    ''')

    # yuankai: changed the optimizer from RMSprop to Adam
    # optimizer = optim.RMSprop(net.parameters(), lr=lr, weight_decay=1e-8, momentum=0.9)
    optimizer = optim.Adam(net.parameters(), lr=lr, betas=(0.9, 0.999))
    # yuankai: removed the LR scheduler
    # scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min' if net.n_classes > 1 else 'max', patience=2)
    if net.n_classes > 1:
        criterion = nn.CrossEntropyLoss()
    else:
        criterion = nn.BCEWithLogitsLoss()

    for epoch in range(epochs):
        net.train()
        epoch_loss = 0
        with tqdm(total=n_train, desc=f'Epoch {epoch + 1}/{epochs}', unit='img') as pbar:
            for batch in train_loader:
                imgs = batch['image']
                true_masks = batch['mask']

                # yuankai: build a two-channel target for the dice loss
                true_masks_2channel = true_masks.unsqueeze(1)
                true_masks_2channel = torch.cat(
                    (1 - true_masks_2channel, true_masks_2channel), 1)
                true_masks_2channel = true_masks_2channel.to(
                    device=device, dtype=torch.float32)

                assert imgs.shape[1] == net.n_channels, \
                    f'Network has been defined with {net.n_channels} input channels, ' \
                    f'but loaded images have {imgs.shape[1]} channels. Please check that ' \
                    'the images are loaded correctly.'
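                # NOTE: the [1 - mask, mask] stack above assumes a binary 0/1
                # mask; it yields one-hot [background, foreground] targets for
                # the soft-Dice term computed below.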
                imgs = imgs.to(device=device, dtype=torch.float32)
                mask_type = torch.float32 if net.n_classes == 1 else torch.long
                true_masks = true_masks.to(device=device, dtype=mask_type)

                masks_pred = net(imgs)
                loss_cross = criterion(masks_pred, true_masks)
                # yuankai: add the dice loss (dice_loss returns minus the soft
                # Dice, so 1 + dice_loss == 1 - Dice)
                loss_dice = 1 + dice_loss(masks_pred, true_masks_2channel)
                # yuankai: sum the two losses as the final loss
                loss = loss_dice + loss_cross
                epoch_loss += loss.item()
                writer.add_scalar('Loss/train', loss.item(), global_step)

                pbar.set_postfix(**{
                    'loss (batch)': loss.item(),
                    'loss_cr (batch)': loss_cross.item(),
                    'loss_dsc (batch)': loss_dice.item()
                })

                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_value_(net.parameters(), 0.1)
                optimizer.step()

                pbar.update(imgs.shape[0])
                # advance the step counter so the TensorBoard scalars are not
                # all written at step 0
                global_step += 1

        for tag, value in net.named_parameters():
            tag = tag.replace('.', '/')
            writer.add_histogram('weights/' + tag, value.data.cpu().numpy(), global_step)
            writer.add_histogram('grads/' + tag, value.grad.data.cpu().numpy(), global_step)

        val_score = eval_net(net, val_loader, device)
        test_score = eval_net(net, test_loader, device)
        external_test_score = eval_net(net, external_test_loader, device)
        # scheduler.step(val_score)
        writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'], global_step)

        logging.info('Finish Epoch %d/%d' % (epoch + 1, epochs))
        # use distinct tags so the three Dice scores do not overwrite each other
        logging.info('Validation Dice Coeff: {}'.format(val_score))
        writer.add_scalar('Dice/val', val_score, global_step)
        logging.info('Internal Testing Dice Coeff: {}'.format(test_score))
        writer.add_scalar('Dice/internal_test', test_score, global_step)
        logging.info('External Testing Dice Coeff: {}'.format(external_test_score))
        writer.add_scalar('Dice/external_test', external_test_score, global_step)

        writer.add_images('images', imgs, global_step)
        if net.n_classes == 1:
            writer.add_images('masks/true', true_masks, global_step)
            writer.add_images('masks/pred', torch.sigmoid(masks_pred) > 0.5, global_step)
        else:
            writer.add_images('masks/true', true_masks.unsqueeze(1), global_step)
            writer.add_images('masks/pred', masks_pred.max(dim=1)[1].unsqueeze(1), global_step)

        if not os.path.exists(dir_checkpoint):
            os.makedirs(dir_checkpoint)
            logging.info('Created checkpoint directory')
        csv_file_name = os.path.join(dir_checkpoint, '%s_result_log.csv' % exp_name)
        convert_result_to_csv(
            [epoch, val_score, test_score, external_test_score], csv_file_name)

        if save_cp and (epoch + 1) % 5 == 0:
            # join the path explicitly; plain concatenation drops the separator
            torch.save(net.state_dict(),
                       os.path.join(dir_checkpoint, f'CP_epoch{epoch + 1}.pth'))
            logging.info(f'Checkpoint {epoch + 1} saved !')

    writer.close()
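
# dice_loss and convert_result_to_csv are called above but defined elsewhere
# in the project. The sketches below state the assumed behaviour: because the
# loss is formed as `1 + dice_loss(...)`, dice_loss presumably returns *minus*
# the soft Dice coefficient, so the sum equals the usual 1 - Dice loss; and
# convert_result_to_csv presumably appends one row of per-epoch scores. Both
# signatures are assumptions for illustration.
def dice_loss(logits, true_2channel, eps=1e-6):
    """Return minus the soft Dice coefficient (sketch, 2-channel targets)."""
    probs = torch.softmax(logits, dim=1)               # (N, 2, H, W)
    inter = (probs * true_2channel).sum(dim=(2, 3))
    union = probs.sum(dim=(2, 3)) + true_2channel.sum(dim=(2, 3))
    dice = ((2 * inter + eps) / (union + eps)).mean()  # soft Dice in [0, 1]
    return -dice


def convert_result_to_csv(row, csv_file_name):
    """Append [epoch, val, test, external_test] scores to a CSV log (sketch)."""
    import csv
    write_header = not os.path.exists(csv_file_name)
    with open(csv_file_name, 'a', newline='') as f:
        w = csv.writer(f)
        if write_header:
            w.writerow(['epoch', 'val_dice', 'test_dice', 'external_test_dice'])
        w.writerow(row)
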
# Earlier variant of train_net, kept for reference: uses module-level path
# globals and doubles the sample size with pre-computed augmented copies.
def train_net(net,
              device,
              epochs=250,
              batch_size=4,
              lr=0.0001,
              val_percent=0.2,  # unused: the val split comes from dir_valimg
              save_cp=True,
              img_scale=1):
    # assumes module-level dir_img, dir_mask, dir_valimg, dir_valmask and
    # dir_checkpoint defined outside this function
    dataset = BasicDataset(dir_img, dir_mask, False, img_scale)
    # pre-processed data (flip, noise, augmentation, etc.)
    datasetAug = BasicDataset(dir_img, dir_mask, True, img_scale)
    dataval = BasicDataset(dir_valimg, dir_valmask, False, img_scale)
    # pre-processed data (flip, noise, augmentation, etc.)
    datavalAug = BasicDataset(dir_valimg, dir_valmask, True, img_scale)

    # increase the sample size with the augmented data
    dataset = dataset + datasetAug
    dataval = dataval + datavalAug

    # yuankai: derive the split sizes automatically from the datasets
    n_val = len(dataval)
    n_train = len(dataset)

    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                              num_workers=0, pin_memory=True)
    val_loader = DataLoader(dataval, batch_size=batch_size, shuffle=False,
                            num_workers=0, pin_memory=True, drop_last=True)

    writer = SummaryWriter(comment=f'LR_{lr}_BS_{batch_size}_SCALE_{img_scale}')
    global_step = 0

    logging.info(f'''Starting training:
        Epochs:          {epochs}
        Batch size:      {batch_size}
        Learning rate:   {lr}
        Training size:   {n_train}
        Validation size: {n_val}
        Checkpoints:     {save_cp}
        Device:          {device.type}
        Images scaling:  {img_scale}
    ''')

    # yuankai: changed the optimizer from RMSprop to Adam
    # optimizer = optim.RMSprop(net.parameters(), lr=lr, weight_decay=1e-8, momentum=0.9)
    optimizer = optim.Adam(net.parameters(), lr=lr, betas=(0.9, 0.999))
    # yuankai: removed the LR scheduler
    # scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min' if net.n_classes > 1 else 'max', patience=2)
    if net.n_classes > 1:
        criterion = nn.CrossEntropyLoss()
    else:
        criterion = nn.BCEWithLogitsLoss()

    for epoch in range(epochs):
        net.train()
        epoch_loss = 0
        with tqdm(total=n_train, desc=f'Epoch {epoch + 1}/{epochs}', unit='img') as pbar:
            for batch in train_loader:
                imgs = batch['image']
                true_masks = batch['mask']

                # yuankai: build a two-channel target for the dice loss;
                # use 1 - mask rather than bitwise ~, which is only correct
                # for bool tensors
                true_masks_2channel = true_masks.unsqueeze(1)
                true_masks_2channel = torch.cat(
                    (1 - true_masks_2channel, true_masks_2channel), 1)
                true_masks_2channel = true_masks_2channel.to(
                    device=device, dtype=torch.float32)

                assert imgs.shape[1] == net.n_channels, \
                    f'Network has been defined with {net.n_channels} input channels, ' \
                    f'but loaded images have {imgs.shape[1]} channels. Please check that ' \
                    'the images are loaded correctly.'
                imgs = imgs.to(device=device, dtype=torch.float32)
                mask_type = torch.float32 if net.n_classes == 1 else torch.long
                true_masks = true_masks.to(device=device, dtype=mask_type)

                masks_pred = net(imgs)
                loss_cross = criterion(masks_pred, true_masks)
                # yuankai: add the dice loss
                loss_dice = 1 + dice_loss(masks_pred, true_masks_2channel)
                # yuankai: sum the two losses as the final loss
                loss = loss_dice + loss_cross
                epoch_loss += loss.item()
                writer.add_scalar('Loss/train', loss.item(), global_step)

                pbar.set_postfix(**{
                    'loss (batch)': loss.item(),
                    'loss_cr (batch)': loss_cross.item(),
                    'loss_dsc (batch)': loss_dice.item()
                })

                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_value_(net.parameters(), 0.1)
                optimizer.step()

                pbar.update(imgs.shape[0])
                global_step += 1
                # evaluate roughly ten times per epoch; max(..., 1) guards
                # against a zero divisor on very small datasets
                if global_step % max(len(dataset) // (10 * batch_size), 1) == 0:
                    for tag, value in net.named_parameters():
                        tag = tag.replace('.', '/')
                        writer.add_histogram('weights/' + tag, value.data.cpu().numpy(), global_step)
                        writer.add_histogram('grads/' + tag, value.grad.data.cpu().numpy(), global_step)

                    val_score = eval_net(net, val_loader, device)
                    # scheduler.step(val_score)
                    writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'], global_step)

                    logging.info('Validation Dice Coeff: {}'.format(val_score))
                    if net.n_classes > 1:
                        writer.add_scalar('Loss/test', val_score, global_step)
                    else:
                        writer.add_scalar('Dice/test', val_score, global_step)

                    writer.add_images('images', imgs, global_step)
                    if net.n_classes == 1:
                        writer.add_images('masks/true', true_masks, global_step)
                        writer.add_images('masks/pred', torch.sigmoid(masks_pred) > 0.5, global_step)
                    else:
                        writer.add_images('masks/true', true_masks.unsqueeze(1), global_step)
                        writer.add_images('masks/pred', masks_pred.max(dim=1)[1].unsqueeze(1), global_step)

        if save_cp and (epoch + 1) % 10 == 0:
            try:
                os.mkdir(dir_checkpoint)
                logging.info('Created checkpoint directory')
            except OSError:
                pass
            # join the path explicitly; plain concatenation drops the separator
            torch.save(net.state_dict(),
                       os.path.join(dir_checkpoint, f'CP_epoch{epoch + 1}.pth'))
            logging.info(f'Checkpoint {epoch + 1} saved !')

    writer.close()
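
# A hypothetical driver showing how a train_net variant above might be
# invoked. The UNet constructor arguments and the module it lives in are
# assumptions for illustration; adapt to the project's actual entry point.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logging.info(f'Using device {device}')

    net = UNet(n_channels=3, n_classes=2)  # assumed constructor
    net.to(device=device)
    train_net(net=net,
              device=device,
              epochs=250,
              batch_size=4,
              lr=0.0001,
              img_scale=1)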