def train_model(
        model,
        criterion,
        optimizer,
        LR,
        num_epochs,
        dataloaders,
        dataset_sizes,
        weight_decay,
        weighted_cross_entropy_batchwise=False,
        fine_tune=False,
        regression=False):
    """
    Fine-tunes a torchvision model to NIH CXR data.

    Args:
        model: torchvision model to be fine-tuned (densenet-121 in this case)
        criterion: loss criterion (binary cross entropy loss, BCELoss)
        optimizer: optimizer to use in training (SGD)
        LR: learning rate
        num_epochs: continue training up to this many epochs
        dataloaders: pytorch train and val dataloaders
        dataset_sizes: length of train and val datasets
        weight_decay: weight decay parameter we use in SGD with momentum
        weighted_cross_entropy_batchwise: reweight BCE per batch by pos/neg counts
        fine_tune: train on the COVID detector labels instead of the NIH labels
        regression: train a regression head (MAE evaluation) instead of classification
    Returns:
        model: trained torchvision model
        best_epoch: epoch on which best model val loss was obtained
    """
    since = time.time()
    start_epoch = 1
    best_loss = 999999
    best_epoch = -1
    last_train_loss = -1

    tensorboard_writer_train = SummaryWriter('runs/loss/train_loss')
    tensorboard_writer_val = SummaryWriter('runs/loss/val_loss')

    if not fine_tune:
        PRED_LABEL = [
            'Atelectasis',
            'Cardiomegaly',
            'Effusion',
            'Infiltration',
            'Mass',
            'Nodule',
            'Pneumonia',
            'Pneumothorax',
            'Consolidation',
            'Edema',
            'Emphysema',
            'Fibrosis',
            'Pleural_Thickening',
            'Hernia']
    else:
        PRED_LABEL = [
            'Detector01',
            'Detector2',
            'Detector3']

    if not regression:
        tensorboard_writer_auc = {}
        tensorboard_writer_AP = {}
        for label in PRED_LABEL:
            tensorboard_writer_auc[label] = SummaryWriter('runs/auc/' + label)
            tensorboard_writer_AP[label] = SummaryWriter('runs/ap/' + label)
    else:
        tensorboard_writer_mae = SummaryWriter('runs/mae')

    # iterate over epochs
    for epoch in range(start_epoch, num_epochs + 1):
        print('Epoch {}/{}'.format(epoch, num_epochs))
        print('-' * 10)

        # set model to train or eval mode based on whether we are in train or
        # val; necessary to get correct predictions given batchnorm
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train(True)
            else:
                model.train(False)

            running_loss = 0.0
            total_done = 0

            for data in dataloaders[phase]:
                if not regression:
                    inputs, labels, _ = data
                else:
                    inputs, ground_truths, _ = data
                batch_size = inputs.shape[0]
                inputs = inputs.to(device)
                if not regression:
                    labels = (labels.to(device)).float()
                else:
                    ground_truths = (ground_truths.to(device)).float()

                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)

                    # calculate gradient and update parameters in train phase
                    optimizer.zero_grad()

                    if weighted_cross_entropy_batchwise:
                        beta = pos_neg_weights_in_batch(labels)
                        criterion = nn.BCEWithLogitsLoss(pos_weight=beta)

                    if not regression:
                        loss = criterion(outputs, labels)
                    else:
                        ground_truths = ground_truths.unsqueeze(1)
                        loss = criterion(outputs, ground_truths)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                running_loss += loss.item() * batch_size

            epoch_loss = running_loss / dataset_sizes[phase]

            if phase == 'train':
                tensorboard_writer_train.add_scalar('Loss', epoch_loss, epoch)
                last_train_loss = epoch_loss
            elif phase == 'val':
                tensorboard_writer_val.add_scalar('Loss', epoch_loss, epoch)

                if not regression:
                    preds, aucs = E.make_pred_multilabel(
                        dataloaders['val'], model,
                        save_as_csv=False, fine_tune=fine_tune)
                    aucs.set_index('label', inplace=True)
                    print(aucs)
                    for label in PRED_LABEL:
                        tensorboard_writer_auc[label].add_scalar(
                            'AUC', aucs.loc[label, 'auc'], epoch)
                        tensorboard_writer_AP[label].add_scalar(
                            'AP', aucs.loc[label, 'AP'], epoch)
                else:
                    mae, _, _ = E.evaluate_mae(dataloaders['val'], model)
                    print('MAE: ', mae)
                    tensorboard_writer_mae.add_scalar('MAE', mae, epoch)

            print(phase + ' epoch {}: loss {:.4f} with data size {}'.format(
                epoch, epoch_loss, dataset_sizes[phase]))

            # checkpoint model if it has the best val loss yet
            if phase == 'val' and epoch_loss < best_loss:
                best_loss = epoch_loss
                best_epoch = epoch
                if not fine_tune:
                    checkpoint(model, best_loss, epoch, LR,
                               filename='checkpoint_best')
                elif fine_tune and not regression:
                    checkpoint(model, best_loss, epoch, LR,
                               filename='classification_checkpoint_best')
                else:
                    checkpoint(model, best_loss, epoch, LR,
                               filename='regression_checkpoint_best')

            # log training and validation loss over each epoch
            if phase == 'val':
                with open("results/log_train", 'a') as logfile:
                    logwriter = csv.writer(logfile, delimiter=',')
                    if epoch == 1:
                        logwriter.writerow(["epoch", "train_loss", "val_loss"])
                    logwriter.writerow([epoch, last_train_loss, epoch_loss])

            # Save model after each epoch
            # checkpoint(model, best_loss, epoch, LR, filename='checkpoint')

        total_done += batch_size
        if total_done % (100 * batch_size) == 0:
            print("completed " + str(total_done) + " so far in epoch")

        # print elapsed time from the beginning after each epoch
        print('Elapsed so far: {:.0f}m {:.0f}s'.format(
            (time.time() - since) // 60, (time.time() - since) % 60))

    # total time
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    # load best model weights to return
    if not fine_tune:
        checkpoint_best = torch.load('results/checkpoint_best')
    elif fine_tune and not regression:
        checkpoint_best = torch.load('results/classification_checkpoint_best')
    else:
        checkpoint_best = torch.load('results/regression_checkpoint_best')
    model = checkpoint_best['model']

    return model, best_epoch
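# ---------------------------------------------------------------------------
# Hedged sketch: train_model above calls pos_neg_weights_in_batch(labels) when
# weighted_cross_entropy_batchwise=True, but the helper is defined elsewhere in
# the repo. A minimal implementation consistent with its use as the pos_weight
# argument of nn.BCEWithLogitsLoss might look like this; the repo's actual
# version may differ.
# ---------------------------------------------------------------------------
def pos_neg_weights_in_batch(labels):
    # labels: (batch_size, n_labels) float tensor of 0/1 multi-label targets
    n_positive = labels.sum(dim=0)
    n_negative = labels.shape[0] - n_positive
    # weight applied to the positive term of each label's BCE; clamp the
    # denominator so labels with no positives in the batch stay finite
    return n_negative / n_positive.clamp(min=1)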
def train_cnn(PATH_TO_IMAGES, LR, WEIGHT_DECAY):
    """
    Train torchvision model to NIH data given high level hyperparameters.

    Args:
        PATH_TO_IMAGES: path to NIH images
        LR: learning rate
        WEIGHT_DECAY: weight decay parameter for SGD
    Returns:
        preds: torchvision model predictions on test fold with ground truth for comparison
        aucs: AUCs for each train,test tuple
    """
    NUM_EPOCHS = 100  # 100
    BATCH_SIZE = 32  # 16

    try:
        rmtree('results/')
    except BaseException:
        pass  # directory doesn't yet exist, no need to clear it
    os.makedirs("results/")

    # use imagenet mean,std for normalization
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    N_LABELS = 15  # we are predicting 15 labels; originally 14 before adding Covid

    # load labels
    df = pd.read_csv("nih_labels_modified.csv", index_col=0)

    # define torchvision transforms
    data_transforms = {
        'train': transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.Resize(224),  # was transforms.Scale
            # because resize alone doesn't always give 224 x 224,
            # the center crop ensures 224 x 224
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean, std)
        ]),
        'val': transforms.Compose([
            transforms.Resize(224),  # was transforms.Scale
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean, std)
        ]),
    }

    # create train/val dataloaders
    transformed_datasets = {}
    transformed_datasets['train'] = CXR.CXRDataset(
        path_to_images=PATH_TO_IMAGES,
        fold='train',
        transform=data_transforms['train'])
    transformed_datasets['val'] = CXR.CXRDataset(
        path_to_images=PATH_TO_IMAGES,
        fold='val',
        transform=data_transforms['val'])

    dataloaders = {}
    dataloaders['train'] = torch.utils.data.DataLoader(
        transformed_datasets['train'],
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=8)
    dataloaders['val'] = torch.utils.data.DataLoader(
        transformed_datasets['val'],
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=8)

    # please do not attempt to train without GPU as it will take excessively long
    if not use_gpu:
        raise ValueError("Error, requires GPU")

    model = models.densenet121(pretrained=True)
    num_ftrs = model.classifier.in_features
    # add final layer with # outputs in same dimension of labels with sigmoid
    # activation
    model.classifier = nn.Sequential(
        nn.Linear(num_ftrs, N_LABELS), nn.Sigmoid())

    # put model on GPU
    model = model.cuda()

    # define criterion, optimizer for training
    criterion = nn.BCELoss()
    optimizer = optim.SGD(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=LR,
        momentum=0.9,
        weight_decay=WEIGHT_DECAY)
    dataset_sizes = {x: len(transformed_datasets[x]) for x in ['train', 'val']}

    # train model
    model, best_epoch = train_model(model, criterion, optimizer, LR,
                                    num_epochs=NUM_EPOCHS,
                                    dataloaders=dataloaders,
                                    dataset_sizes=dataset_sizes,
                                    weight_decay=WEIGHT_DECAY)

    # get preds and AUCs on test fold
    preds, aucs = E.make_pred_multilabel(
        data_transforms, model, PATH_TO_IMAGES)

    return preds, aucs
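# Hedged usage sketch: a hypothetical entry point for this module. The image
# path and hyperparameters below are placeholders, not values from the repo.
if __name__ == '__main__':
    preds, aucs = train_cnn(
        PATH_TO_IMAGES='/data/nih/images',  # hypothetical path to NIH PNGs
        LR=0.01,
        WEIGHT_DECAY=1e-4)
    print(aucs)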
def train_full(PATH_TO_IMAGES, LR, WEIGHT_DECAY):
    """
    Train torchvision model to RSNA pneumonia data given high level hyperparameters.

    Args:
        PATH_TO_IMAGES: path to RSNA images
        LR: learning rate
        WEIGHT_DECAY: weight decay parameter for SGD
    Returns:
        preds: torchvision model predictions on test fold with ground truth for comparison
        aucs: AUCs for each train,test tuple
        model: trained torchvision model
    """
    #==========================================
    # Initialization
    #==========================================
    NUM_EPOCHS = 100
    BATCH_SIZE = 16

    try:
        rmtree('results/')
    except BaseException:
        pass  # directory doesn't yet exist, no need to clear it
    os.makedirs("results/")

    # use imagenet mean,std for normalization
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    # 0 - does not have pneumonia, 1 - positive for pneumonia
    N_LABELS = 2

    #==========================================
    # Load labels
    #==========================================
    print("- Loading Data")
    df = pd.read_csv("rsna_labels.csv", index_col=0)

    #==========================================
    # Define torchvision transforms
    #==========================================
    print("- Transforming Images")
    data_transforms = {
        'train': transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.Resize(224),  # was transforms.Scale
            # because resize alone doesn't always give 224 x 224,
            # the center crop ensures 224 x 224
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean, std)
        ]),
        'val': transforms.Compose([
            transforms.Resize(224),  # was transforms.Scale
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean, std)
        ]),
    }

    #==========================================
    # Create train/val dataloaders
    #==========================================
    transformed_datasets = {}
    transformed_datasets['train'] = CXR.RSNA_Dataset(
        path_to_images=PATH_TO_IMAGES,
        mode='train',
        transform=data_transforms['train'])
    transformed_datasets['val'] = CXR.RSNA_Dataset(
        path_to_images=PATH_TO_IMAGES,
        mode='val',
        transform=data_transforms['val'])

    dataloaders = {}
    dataloaders['train'] = torch.utils.data.DataLoader(
        transformed_datasets['train'],
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=8)
    dataloaders['val'] = torch.utils.data.DataLoader(
        transformed_datasets['val'],
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=8)

    # please do not attempt to train without GPU as it will take excessively long
    if not use_gpu:
        raise ValueError("Error, requires GPU")

    model = models.densenet121(pretrained=False)

    #==========================================
    # Load CheXNet weights
    #==========================================
    PATH_TO_MODEL = "pretrained/checkpoint"
    checkpoint = torch.load(PATH_TO_MODEL,
                            map_location=lambda storage, loc: storage)
    model = checkpoint['model']
    del checkpoint
    model.cpu()

    #==========================================
    # Adapt the last layer
    #==========================================
    # calculate the input size of the new layer
    num_ftrs = model.classifier[0].in_features
    # swap the classifier for one with only 2 classes
    model.classifier = nn.Sequential(
        nn.Linear(num_ftrs, N_LABELS), nn.Softmax(dim=1))

    #==========================================
    # Put model on GPU
    #==========================================
    model = model.cuda()

    #==========================================
    # Define criterion, optimizer for training
    #==========================================
    criterion = nn.BCELoss()
    optimizer = optim.SGD(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=LR,
        momentum=0.9,
        weight_decay=WEIGHT_DECAY)
    dataset_sizes = {x: len(transformed_datasets[x]) for x in ['train', 'val']}

    #==========================================
    # Train model
    #==========================================
    model, best_epoch = train_model(model, criterion, optimizer, LR,
                                    num_epochs=NUM_EPOCHS,
                                    dataloaders=dataloaders,
                                    dataset_sizes=dataset_sizes,
                                    weight_decay=WEIGHT_DECAY)

    #==========================================
    # Get preds and AUCs on test fold
    #==========================================
    preds, aucs = E.make_pred_multilabel(
        data_transforms, model, PATH_TO_IMAGES)

    return preds, aucs, model
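#==========================================
# Hedged sketch: the checkpoint helper used by train_model/train_full is
# defined elsewhere in the repo. To be consistent with the loading code above,
# which does torch.load(...)['model'], it must at least store the full model
# under a 'model' key in results/<filename>; the actual helper may save more.
#==========================================
def checkpoint(model, best_loss, epoch, LR, filename='checkpoint'):
    state = {
        'model': model,  # full model object, as expected by the loaders above
        'best_loss': best_loss,
        'epoch': epoch,
        'LR': LR,
    }
    torch.save(state, 'results/' + filename)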
def train_cnn(PATH_TO_IMAGES, LR, WEIGHT_DECAY, fine_tune=False,
              regression=False, freeze=False, adam=False,
              initial_model_path=None, initial_brixia_model_path=None,
              weighted_cross_entropy_batchwise=False, modification=None,
              weighted_cross_entropy=False):
    """
    Train torchvision model to NIH data given high level hyperparameters.

    Args:
        PATH_TO_IMAGES: path to NIH images
        LR: learning rate
        WEIGHT_DECAY: weight decay parameter for SGD
    Returns:
        preds: torchvision model predictions on test fold with ground truth for comparison
        aucs: AUCs for each train,test tuple
    """
    NUM_EPOCHS = 100
    BATCH_SIZE = 32

    try:
        rmtree('results/')
    except BaseException:
        pass  # directory doesn't yet exist, no need to clear it
    os.makedirs("results/")

    # use imagenet mean,std for normalization
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    N_LABELS = 14  # we are predicting 14 labels
    N_COVID_LABELS = 3  # we are predicting 3 COVID labels

    # define torchvision transforms
    data_transforms = {
        'train': transforms.Compose([
            # transforms.RandomHorizontalFlip(),
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean, std)
        ]),
        'val': transforms.Compose([
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean, std)
        ]),
    }

    # create train/val dataloaders
    transformed_datasets = {}
    transformed_datasets['train'] = CXR.CXRDataset(
        path_to_images=PATH_TO_IMAGES,
        fold='train',
        transform=data_transforms['train'],
        fine_tune=fine_tune,
        regression=regression)
    transformed_datasets['val'] = CXR.CXRDataset(
        path_to_images=PATH_TO_IMAGES,
        fold='val',
        transform=data_transforms['val'],
        fine_tune=fine_tune,
        regression=regression)

    dataloaders = {}
    dataloaders['train'] = torch.utils.data.DataLoader(
        transformed_datasets['train'],
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=8)
    dataloaders['val'] = torch.utils.data.DataLoader(
        transformed_datasets['val'],
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=8)

    # please do not attempt to train without GPU as it will take excessively long
    if not use_gpu:
        raise ValueError("Error, requires GPU")

    if initial_model_path or initial_brixia_model_path:
        if initial_model_path:
            saved_model = torch.load(initial_model_path)
        else:
            saved_model = torch.load(initial_brixia_model_path)
        model = saved_model['model']
        del saved_model

        if fine_tune and not initial_brixia_model_path:
            num_ftrs = model.module.classifier.in_features
            if freeze:
                # freeze feature blocks up to and including transition2
                for feature in model.module.features:
                    for param in feature.parameters():
                        param.requires_grad = False
                    if feature == model.module.features.transition2:
                        break
            if not regression:
                model.module.classifier = nn.Linear(num_ftrs, N_COVID_LABELS)
            else:
                model.module.classifier = nn.Sequential(
                    nn.Linear(num_ftrs, 1),
                    nn.ReLU(inplace=True)
                )
    else:
        model = models.densenet121(pretrained=True)
        num_ftrs = model.classifier.in_features
        model.classifier = nn.Linear(num_ftrs, N_LABELS)

        if modification == 'transition_layer':
            # num_ftrs = model.features.norm5.num_features
            up1 = torch.nn.Sequential(
                torch.nn.ConvTranspose2d(num_ftrs, num_ftrs, kernel_size=3,
                                         stride=2, padding=1),
                torch.nn.BatchNorm2d(num_ftrs),
                torch.nn.ReLU(True))
            up2 = torch.nn.Sequential(
                torch.nn.ConvTranspose2d(num_ftrs, num_ftrs, kernel_size=3,
                                         stride=2, padding=1),
                torch.nn.BatchNorm2d(num_ftrs))
            transition_layer = torch.nn.Sequential(up1, up2)
            model.features.add_module('transition_chestX', transition_layer)

        if modification == 'remove_last_block':
            model.features.denseblock4 = nn.Sequential()
            model.features.transition3 = nn.Sequential()
            # model.features.norm5 = nn.BatchNorm2d(512)
            # model.classifier = nn.Linear(512, N_LABELS)

        if modification == 'remove_last_two_block':
            model.features.denseblock4 = nn.Sequential()
            model.features.transition3 = nn.Sequential()
            model.features.transition2 = nn.Sequential()
            model.features.denseblock3 = nn.Sequential()
            model.features.norm5 = nn.BatchNorm2d(512)
            model.classifier = nn.Linear(512, N_LABELS)

    print(model)

    # put model on GPU
    if not initial_model_path:
        model = nn.DataParallel(model)
    model.to(device)

    if regression:
        criterion = nn.MSELoss()
    else:
        if weighted_cross_entropy:
            pos_weights = transformed_datasets['train'].pos_neg_balance_weights()
            print(pos_weights)
            # pos_weights[pos_weights > 40] = 40
            criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weights)
        else:
            criterion = nn.BCEWithLogitsLoss()

    if adam:
        optimizer = optim.Adam(
            filter(lambda p: p.requires_grad, model.parameters()),
            lr=LR,
            weight_decay=WEIGHT_DECAY)
    else:
        optimizer = optim.SGD(
            filter(lambda p: p.requires_grad, model.parameters()),
            lr=LR,
            weight_decay=WEIGHT_DECAY,
            momentum=0.9)

    dataset_sizes = {x: len(transformed_datasets[x]) for x in ['train', 'val']}

    # train model
    if regression:
        model, best_epoch = train_model(
            model, criterion, optimizer, LR,
            num_epochs=NUM_EPOCHS,
            dataloaders=dataloaders,
            dataset_sizes=dataset_sizes,
            weight_decay=WEIGHT_DECAY,
            fine_tune=fine_tune,
            regression=regression)
    else:
        model, best_epoch = train_model(
            model, criterion, optimizer, LR,
            num_epochs=NUM_EPOCHS,
            dataloaders=dataloaders,
            dataset_sizes=dataset_sizes,
            weight_decay=WEIGHT_DECAY,
            weighted_cross_entropy_batchwise=weighted_cross_entropy_batchwise,
            fine_tune=fine_tune)

    # get preds and AUCs on test fold
    preds, aucs = E.make_pred_multilabel(
        dataloaders['val'], model, save_as_csv=False, fine_tune=fine_tune)

    return preds, aucs
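# Hedged usage sketch: fine-tuning a previously trained NIH model on the COVID
# detector labels. The image root and checkpoint path below are placeholders.
if __name__ == '__main__':
    preds, aucs = train_cnn(
        PATH_TO_IMAGES='/data/covid/images',  # hypothetical image root
        LR=0.001,
        WEIGHT_DECAY=1e-4,
        fine_tune=True,
        freeze=True,  # freeze feature blocks up to and including transition2
        initial_model_path='results/checkpoint_best')  # hypothetical checkpoint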
TRAIN_DATASET = sys.argv[1]
TEST_DATASET = sys.argv[2]

if (TRAIN_DATASET not in DATASET_NAMES) or (TEST_DATASET not in DATASET_NAMES):
    raise Exception(
        'Invalid dataset name, needs to be one of the following: nih, chexpert, mimic')

# torchvision transforms
data_transforms = get_data_transforms()

PATH_TO_MODEL = '/A/eduardo/projects/cxr_gen/' + TRAIN_DATASET + '/model/modelinf.pt'
PATH_TO_IMAGES = '/A/eduardo/datasets/' + TEST_DATASET
N_LABELS = 14

# load model
model = models.densenet121(pretrained=True)
num_ftrs = model.classifier.in_features
# add final layer with # outputs in same dimension of labels with sigmoid
# activation
model.classifier = nn.Sequential(nn.Linear(num_ftrs, N_LABELS), nn.Sigmoid())
model.load_state_dict(torch.load(PATH_TO_MODEL))

# put model on GPU
model = model.cuda()

preds = E.make_pred_multilabel(
    data_transforms, model, PATH_TO_IMAGES, TRAIN_DATASET, TEST_DATASET)
E.calc_aucs(preds, TRAIN_DATASET, TEST_DATASET)
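# Hedged sketch: get_data_transforms() is imported from elsewhere in the repo,
# so its exact return shape is not shown here. Given the ImageNet normalization
# and 224 x 224 inputs used throughout these scripts, a minimal stand-in (which
# would need to be defined before the calls above) might look like this:
def get_data_transforms():
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    return {
        'test': transforms.Compose([
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean, std)
        ]),
    }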
def train_model(model, criterion, optimizer, LR, num_epochs, dataloaders,
                dataset_sizes, weight_decay, dataset, data_transforms,
                PATH_TO_IMAGES, PATH_TO_CSV, val_on_dataset=False):
    """
    Fine-tunes a torchvision model to CheXpert data.

    Args:
        model: torchvision model to be fine-tuned (densenet-121 in this case)
        criterion: loss criterion (binary cross entropy loss, BCELoss)
        optimizer: optimizer to use in training (Adam)
        LR: learning rate
        num_epochs: continue training up to this many epochs
        dataloaders: pytorch train and val dataloaders
        dataset_sizes: length of train and val datasets
        weight_decay: weight decay parameter we use with Adam
    Returns:
        model: trained torchvision model
        best_epoch: epoch on which best model val accuracy was obtained
    """
    since = time.time()
    start_epoch = 1
    best_loss = 999999
    best_val_acc = 0
    best_epoch = -1
    last_train_loss = -1
    last_val_acc = 0

    if val_on_dataset:
        print("WARNING: VALIDATING ON DATASET")
        with open("results/logger", 'a') as logfile:
            logfile.write("WARNING: VALIDATING ON DATASET\n")

    print(time.strftime("%d %b %Y %H:%M:%S",
                        time.gmtime(time.time() - 25200)))
    with open("results/logger", 'a') as logfile:
        logfile.write(time.strftime("%d %b %Y %H:%M:%S\n",
                                    time.gmtime(time.time() - 25200)))

    # iterate over epochs
    for epoch in range(start_epoch, num_epochs + 1):
        print('Epoch {}/{}'.format(epoch, num_epochs))
        print('-' * 10)
        with open("results/logger", 'a') as logfile:
            logfile.write('Epoch {}/{}\n'.format(epoch, num_epochs))
            logfile.write('-' * 10 + '\n')

        running_loss = 0.0
        running_misclass = 0
        i = 0
        total_done = 0

        model.train(True)
        print("Model train: ", model.training)

        # iterate over all data in the train dataloader
        for data in dataloaders['train']:
            i += 1
            inputs, labels, _ = data
            # print(labels)
            batch_size = inputs.shape[0]
            inputs = Variable(inputs.cuda())
            if str(criterion) == str(nn.BCELoss()):
                labels = Variable(labels.cuda()).float()
            else:
                labels = Variable(labels.cuda()).long()

            outputs = model(inputs)

            # calculate gradient and update parameters
            optimizer.zero_grad()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.data.item() * batch_size

            # if phase == 'val' and str(criterion) == str(nn.CrossEntropyLoss()):
            #     print(labels)
            #     idx = torch.argmax(outputs, dim=1)
            #     print(idx)

        epoch_loss = running_loss / dataset_sizes['train']

        print(time.strftime("%d %b %Y %H:%M:%S",
                            time.gmtime(time.time() - 25200)))
        with open("results/logger", 'a') as logfile:
            logfile.write(time.strftime("%d %b %Y %H:%M:%S\n",
                                        time.gmtime(time.time() - 25200)))

        print('train epoch {}: loss {:.4f} with data size {}'.format(
            epoch, epoch_loss, dataset_sizes['train']))
        with open("results/logger", 'a') as logfile:
            logfile.write('train epoch {}: loss {:.4f} with data size {}\n'.format(
                epoch, epoch_loss, dataset_sizes['train']))

        time_elapsed = time.time() - since
        print('train epoch complete in {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))
        with open("results/logger", 'a') as logfile:
            logfile.write('train epoch complete in {:.0f}m {:.0f}s\n'.format(
                time_elapsed // 60, time_elapsed % 60))

        last_train_loss = epoch_loss

        # keep track of best train loss
        if epoch_loss < best_loss:
            best_loss = epoch_loss

        # done with training; evaluate on validation data
        if str(criterion) == 'BCELoss()':
            if val_on_dataset:
                _, metric = E.make_pred_multilabel(
                    data_transforms, model, PATH_TO_IMAGES, PATH_TO_CSV,
                    'auc', dataset=dataset)
            else:
                _, metric = E.make_pred_multilabel(
                    data_transforms, model, PATH_TO_IMAGES, PATH_TO_CSV, 'auc')
        else:
            _, metric = E.make_pred_multilabel(
                data_transforms, model, PATH_TO_IMAGES, PATH_TO_CSV,
                'auc', dataset=dataset, multiclass=True)

        # .as_matrix() was removed in pandas 1.0; .values is the equivalent
        auc = metric[metric.columns[1:]].values
        last_val_acc = auc[~np.isnan(auc)].mean()

        print(metric)
        with open("results/logger", 'a') as logfile:
            print(metric, file=logfile)
        print('mean epoch validation accuracy:', last_val_acc)
        with open("results/logger", 'a') as logfile:
            logfile.write('mean epoch validation accuracy: ' +
                          str(last_val_acc) + '\n')

        # decay learning rate if no val accuracy improvement in this epoch
        if last_val_acc < best_val_acc:
            print("Running with LR decay on val accuracy")
            with open("results/logger", 'a') as logfile:
                logfile.write("Running with LR decay on val accuracy\n")
            print("decay LR from " + str(LR) + " to " + str(LR / 10) +
                  " as not seeing improvement in val accuracy")
            with open("results/logger", 'a') as logfile:
                logfile.write("decay LR from " + str(LR) + " to " + str(LR / 10) +
                              " as not seeing improvement in val accuracy\n")
            LR = LR / 10
            optimizer = optim.Adam(
                filter(lambda p: p.requires_grad, model.parameters()),
                lr=LR,
                betas=(0.9, 0.999),
                eps=1e-08,
                weight_decay=weight_decay)
            print("created new optimizer with LR " + str(LR))
            with open("results/logger", 'a') as logfile:
                logfile.write("created new optimizer with LR " + str(LR) + '\n')

        # track best val accuracy yet
        if last_val_acc > best_val_acc:
            best_val_acc = last_val_acc
            best_epoch = epoch
            print('saving checkpoint_' + str(epoch))
            with open("results/logger", 'a') as logfile:
                logfile.write('saving checkpoint_' + str(epoch) + '\n')
            checkpoint(model, last_train_loss, last_val_acc, metric, epoch,
                       best_epoch, LR, weight_decay)

        # log training loss over each epoch
        with open("results/log_train", 'a') as logfile:
            logwriter = csv.writer(logfile, delimiter=',')
            if epoch == 1:
                logwriter.writerow(["epoch", "train_loss", "average auc"])
            logwriter.writerow([epoch, last_train_loss, last_val_acc])

        print("best epoch: ", best_epoch)
        with open("results/logger", 'a') as logfile:
            logfile.write("best epoch: " + str(best_epoch) + '\n')
        print("best train loss: ", best_loss)
        with open("results/logger", 'a') as logfile:
            logfile.write("best train loss: " + str(best_loss) + '\n')
        print("best val accuracy: ", best_val_acc)
        with open("results/logger", 'a') as logfile:
            logfile.write("best val accuracy: " + str(best_val_acc) + '\n')

        total_done += batch_size
        if total_done % (100 * batch_size) == 0:
            print("completed " + str(total_done) + " so far in epoch")
            with open("results/logger", 'a') as logfile:
                logfile.write("completed " + str(total_done) + " so far in epoch\n")

        # break if no val accuracy improvement in 3 epochs
        if (epoch - best_epoch) >= 3:
            print("no improvement in 3 epochs, break")
            with open("results/logger", 'a') as logfile:
                logfile.write("no improvement in 3 epochs, break\n")
            break

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    with open("results/logger", 'a') as logfile:
        logfile.write('Training complete in {:.0f}m {:.0f}s\n'.format(
            time_elapsed // 60, time_elapsed % 60))

    # load best model weights to return
    checkpoint_best = torch.load('results/checkpoint_' + str(best_epoch))
    model = checkpoint_best['model']

    return model, best_epoch
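# Hedged sketch: the checkpoint helper assumed by the train_model variant
# above. The reload does torch.load('results/checkpoint_' + str(best_epoch))
# ['model'], so the helper must save the full model under a 'model' key in an
# epoch-numbered file; the extra fields stored here are a guess at useful
# metadata, not the repo's confirmed format.
def checkpoint(model, train_loss, val_acc, metric, epoch, best_epoch, LR,
               weight_decay):
    state = {
        'model': model,
        'train_loss': train_loss,
        'val_acc': val_acc,
        'metric': metric,
        'epoch': epoch,
        'best_epoch': best_epoch,
        'LR': LR,
        'weight_decay': weight_decay,
    }
    torch.save(state, 'results/checkpoint_' + str(epoch))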
def run(PATH_TO_IMAGES, LR, WEIGHT_DECAY, opt):
    """
    Train torchvision model to NIH data given high level hyperparameters.

    Args:
        PATH_TO_IMAGES: path to NIH images
        LR: learning rate
        WEIGHT_DECAY: weight decay parameter for SGD
    Returns:
        preds: torchvision model predictions on test fold with ground truth for comparison
        aucs: AUCs for each train,test tuple
    """
    use_gpu = torch.cuda.is_available()
    gpu_count = torch.cuda.device_count()
    print("Available GPU count: " + str(gpu_count))

    wandb.init(project=opt.project, name=opt.run_name)
    wandb.config.update(opt, allow_val_change=True)

    NUM_EPOCHS = 60
    BATCH_SIZE = opt.batch_size

    if opt.eval_only:
        # test only; it is okay to have a duplicate run_path
        os.makedirs(opt.run_path, exist_ok=True)
    else:
        # train from scratch; runs should not share a run_path,
        # otherwise they would overwrite previous runs
        try:
            os.makedirs(opt.run_path)
        except FileExistsError:
            print("[ERROR] run_path {} exists. try to assign a unique run_path".format(opt.run_path))
            return None, None
        except Exception as e:
            print("exception while creating run_path {}".format(opt.run_path))
            print(str(e))
            return None, None

    # use imagenet mean,std for normalization
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    N_LABELS = 14  # we are predicting 14 labels

    # define torchvision transforms
    if opt.random_crop:
        data_transforms = {
            'train': transforms.Compose([
                transforms.RandomHorizontalFlip(),
                # crop then resize
                transforms.RandomResizedCrop(size=opt.input_size, scale=(0.8, 1.0)),
                transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
                transforms.ToTensor(),
                transforms.Normalize(mean, std)
            ]),
            'val': transforms.Compose([
                transforms.Resize(int(opt.input_size * 1.05)),
                transforms.CenterCrop(opt.input_size),
                transforms.ToTensor(),
                transforms.Normalize(mean, std)
            ]),
        }
    else:
        data_transforms = {
            'train': transforms.Compose([
                transforms.RandomHorizontalFlip(),
                transforms.Resize(opt.input_size),
                transforms.ToTensor(),
                transforms.Normalize(mean, std)
            ]),
            'val': transforms.Compose([
                transforms.Resize(opt.input_size),
                transforms.ToTensor(),
                transforms.Normalize(mean, std)
            ]),
        }

    # create train/val dataloaders
    transformed_datasets = {}
    transformed_datasets['train'] = CXR.CXRDataset(
        path_to_images=PATH_TO_IMAGES,
        fold='train',
        transform=data_transforms['train'])
    transformed_datasets['val'] = CXR.CXRDataset(
        path_to_images=PATH_TO_IMAGES,
        fold='val',
        transform=data_transforms['val'])

    worker_init_fn = set_seed(opt)

    dataloaders = {}
    dataloaders['train'] = torch.utils.data.DataLoader(
        transformed_datasets['train'],
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=30,
        drop_last=True,
        worker_init_fn=worker_init_fn)
    dataloaders['val'] = torch.utils.data.DataLoader(
        transformed_datasets['val'],
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=30,
        drop_last=True,
        worker_init_fn=worker_init_fn)

    # please do not attempt to train without GPU as it will take excessively long
    if not use_gpu:
        raise ValueError("Error, requires GPU")

    # load model
    model = load_model(N_LABELS, opt)

    # define criterion, optimizer for training
    criterion = nn.BCELoss()
    optimizer = create_optimizer(model, LR, WEIGHT_DECAY, opt)
    scheduler = lr_scheduler.ReduceLROnPlateau(
        optimizer, 'max',
        factor=opt.lr_decay_ratio,
        patience=opt.patience,
        verbose=True)

    dataset_sizes = {x: len(transformed_datasets[x]) for x in ['train', 'val']}

    if opt.eval_only:
        print("loading best model statedict")
        # load best model weights to return
        checkpoint_best = torch.load(os.path.join(opt.run_path, 'checkpoint'))
        model = load_model(N_LABELS, opt=opt)
        model.load_state_dict(checkpoint_best['state_dict'])
    else:
        # train model
        model, best_epoch = train_model(
            model, criterion, optimizer, LR,
            scheduler=scheduler,
            num_epochs=NUM_EPOCHS,
            dataloaders=dataloaders,
            dataset_sizes=dataset_sizes,
            PATH_TO_IMAGES=PATH_TO_IMAGES,
            data_transforms=data_transforms,
            opt=opt,
        )

    # get preds and AUCs on test fold
    preds, aucs = E.make_pred_multilabel(
        data_transforms, model, PATH_TO_IMAGES, fold="test", opt=opt)

    wandb.log({
        'val_official': np.average(list(aucs.auc))
    })

    return preds, aucs
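# Hedged sketch: create_optimizer(model, LR, WEIGHT_DECAY, opt) is defined
# elsewhere in the repo. Given the SGD/Adam choices used across these scripts,
# a plausible version switches on a hypothetical opt.optimizer flag (the flag
# name is an assumption, not from the repo):
def create_optimizer(model, LR, WEIGHT_DECAY, opt):
    params = filter(lambda p: p.requires_grad, model.parameters())
    if getattr(opt, 'optimizer', 'sgd') == 'adam':
        return optim.Adam(params, lr=LR, weight_decay=WEIGHT_DECAY)
    return optim.SGD(params, lr=LR, momentum=0.9, weight_decay=WEIGHT_DECAY)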
def train_model(
        model,
        criterion,
        optimizer,
        LR,
        scheduler,
        num_epochs,
        dataloaders,
        dataset_sizes,
        PATH_TO_IMAGES,
        data_transforms,
        opt,
):
    """
    Fine-tunes a torchvision model to NIH CXR data.

    Args:
        model: torchvision model to be fine-tuned (densenet-121 in this case)
        criterion: loss criterion (binary cross entropy loss, BCELoss)
        optimizer: optimizer to use in training (SGD)
        LR: learning rate
        scheduler: ReduceLROnPlateau scheduler stepped on val AUC
        num_epochs: continue training up to this many epochs
        dataloaders: pytorch train and val dataloaders
        dataset_sizes: length of train and val datasets
    Returns:
        model: trained torchvision model
        best_epoch: epoch on which best model val AUC was obtained
    """
    since = time.time()
    start_epoch = 1
    best_auc = -1
    best_epoch = -1
    last_train_loss = -1

    # iterate over epochs
    for epoch in range(start_epoch, num_epochs + 1):
        print('Epoch {}/{}(max)'.format(epoch, num_epochs))
        print('-' * 10)

        # set model to train or eval mode based on whether we are in train or val;
        # necessary to get correct predictions given batchnorm
        for phase in ['train', 'val']:
            print('Epoch %03d, ' % epoch, phase)
            if phase == 'train':
                model.train(True)
            else:
                model.train(False)

            running_loss = 0.0
            i = 0
            total_done = 0

            # iterate over all data in train/val dataloader:
            data_length = len(dataloaders[phase])
            for data_idx, data in enumerate(dataloaders[phase]):
                inputs, labels, _ = data
                batch_size = inputs.shape[0]

                if phase == 'val':
                    with torch.no_grad():
                        inputs = inputs.cuda(opt.gpu_ids[0])
                        labels = labels.cuda(opt.gpu_ids[0]).float()
                        outputs = model(inputs)
                        if isinstance(outputs, tuple):
                            # has dot product
                            outputs, dp = outputs
                        else:
                            dp = None
                        optimizer.zero_grad()
                        loss = criterion(outputs, labels)
                else:
                    inputs = inputs.cuda(opt.gpu_ids[0])
                    labels = labels.cuda(opt.gpu_ids[0]).float()
                    outputs = model(inputs)
                    if isinstance(outputs, tuple):
                        # has dot product
                        outputs, dp = outputs
                    else:
                        dp = None
                    # calculate gradient and update parameters in train phase
                    optimizer.zero_grad()
                    loss = criterion(outputs, labels)
                    if dp is not None:
                        dp_loss = opt.orth_loss_lambda * torch.abs(dp.mean())
                        loss = loss + dp_loss

                if phase == 'train':
                    loss.backward()
                    optimizer.step()
                    if data_idx % 20 == 0:
                        wandb.log({
                            'epoch': epoch + data_idx / float(len(dataloaders[phase])),
                            'loss': loss.cpu(),
                            'lr': list(optimizer.param_groups)[0]['lr']
                        })

                if data_idx == 0:
                    log_images = []
                    for image in list(inputs[:10].cpu()):
                        log_images.append(wandb.Image(
                            np.transpose(image.numpy(), (1, 2, 0)),
                            caption='{}_image'.format(phase)))
                    wandb.log({'{}_image'.format(phase): log_images})

                running_loss += loss.data.item() * batch_size

                if data_idx % 100 == 0:
                    print("{} / {} ".format(data_idx, data_length),
                          end="\r", flush=True)

            epoch_loss = running_loss / dataset_sizes[phase]
            if phase == 'train':
                last_train_loss = epoch_loss

            print(phase + ' epoch {}: loss {:.4f} with data size {}'.format(
                epoch, epoch_loss, dataset_sizes[phase]))

            # evaluate on the val fold and step the plateau scheduler on mean AUC
            if phase == 'val':
                pred, auc = E.make_pred_multilabel(
                    data_transforms, model, PATH_TO_IMAGES, fold="val", opt=opt)
                wandb.log({
                    'epoch': epoch + 1,
                    'performance': np.average(list(auc.auc))
                })
                epoch_auc = np.average(list(auc.auc))
                scheduler.step(epoch_auc)

            # checkpoint model if it has the best val AUC yet
            if phase == 'val' and epoch_auc > best_auc:
                # best_loss = epoch_loss
                best_auc = epoch_auc
                best_epoch = epoch
                checkpoint(model, best_auc, epoch, LR, opt)

            # log training and validation loss over each epoch
            if phase == 'val':
                with open(os.path.join(opt.run_path, "log_train"), 'a') as logfile:
                    logwriter = csv.writer(logfile, delimiter=',')
                    if epoch == 1:
                        logwriter.writerow(["epoch", "train_loss", "val_loss"])
                    logwriter.writerow([epoch, last_train_loss, epoch_loss])

            total_done += batch_size
            if total_done % (100 * batch_size) == 0:
                print("completed " + str(total_done) + " so far in epoch")

        # stop once the scheduler has dropped the LR num_lr_drops times
        if np.round(list(optimizer.param_groups)[0]['lr'], 5) <= np.round(
                LR * (opt.lr_decay_ratio ** opt.num_lr_drops), 5):
            break

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    # load best model weights to return
    checkpoint_best = torch.load(os.path.join(opt.run_path, 'checkpoint'))
    model = load_model(N_LABELS=14, opt=opt)
    model.load_state_dict(checkpoint_best['state_dict'])

    return model, best_epoch
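# Hedged sketch: the checkpoint helper assumed by the train_model variant
# above. The eval/reload code reads
# torch.load(os.path.join(opt.run_path, 'checkpoint'))['state_dict'], so the
# helper must save a state_dict under that key; the other fields are guesses.
def checkpoint(model, best_auc, epoch, LR, opt):
    state = {
        'state_dict': model.state_dict(),
        'best_auc': best_auc,
        'epoch': epoch,
        'LR': LR,
    }
    torch.save(state, os.path.join(opt.run_path, 'checkpoint'))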
def train_cnn(PATH_TO_IMAGES, LR, WEIGHT_DECAY):
    """
    Train torchvision model to NIH data given high level hyperparameters.

    Args:
        PATH_TO_IMAGES: path to NIH images
        LR: learning rate
        WEIGHT_DECAY: weight decay parameter for SGD
    Returns:
        preds: torchvision model predictions on test fold with ground truth for comparison
        aucs: AUCs for each train,test tuple
    """
    NUM_EPOCHS = 100
    BATCH_SIZE = 16

    try:
        rmtree('results/')
    except BaseException:
        pass  # directory doesn't yet exist, no need to clear it
    os.makedirs("results/")

    # use imagenet mean,std for normalization
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    N_LABELS = 14  # we are predicting 14 labels

    # load labels
    df = pd.read_csv("nih_labels.csv", index_col=0)

    # define torchvision transforms
    data_transforms = {
        'train': transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.Resize(224),  # transforms.Scale in older torchvision
            # because resize alone doesn't always give 224 x 224,
            # the center crop ensures 224 x 224
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean, std)
        ]),
        'val': transforms.Compose([
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean, std)
        ]),
    }

    # create train/val dataloaders
    transformed_datasets = {}
    transformed_datasets['train'] = CXR.CXRDataset(
        path_to_images=PATH_TO_IMAGES,
        fold='train',
        transform=data_transforms['train'])
    transformed_datasets['val'] = CXR.CXRDataset(
        path_to_images=PATH_TO_IMAGES,
        fold='val',
        transform=data_transforms['val'])

    dataloaders = {}
    dataloaders['train'] = torch.utils.data.DataLoader(
        transformed_datasets['train'],
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=8)
    dataloaders['val'] = torch.utils.data.DataLoader(
        transformed_datasets['val'],
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=8)

    # please do not attempt to train without GPU as it will take excessively long
    if not use_gpu:
        raise ValueError("Error, requires GPU")

    model = models.densenet121(pretrained=True)
    num_ftrs = model.classifier.in_features
    # add final layer with # outputs in same dimension of labels with sigmoid
    # activation
    model.classifier = nn.Sequential(
        nn.Linear(num_ftrs, N_LABELS),
        nn.Sigmoid())

    # put model on GPU
    model = model.cuda()

    # define criterion, optimizer for training
    criterion = nn.BCELoss()
    optimizer = optim.SGD(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=LR,
        momentum=0.9,
        weight_decay=WEIGHT_DECAY)
    dataset_sizes = {x: len(transformed_datasets[x]) for x in ['train', 'val']}

    # train model
    model, best_epoch = train_model(model, criterion, optimizer, LR,
                                    num_epochs=NUM_EPOCHS,
                                    dataloaders=dataloaders,
                                    dataset_sizes=dataset_sizes,
                                    weight_decay=WEIGHT_DECAY)

    # get preds and AUCs on test fold
    preds, aucs = E.make_pred_multilabel(
        data_transforms, model, PATH_TO_IMAGES)

    return preds, aucs