# Example 1
def load_rsna_data(PATH_TO_IMAGES, PATH_TO_MODEL, fold='train'):
    """
    Loads dataloader and torchvision model.

    Args:
        PATH_TO_IMAGES: path to RSNA CXR images
        PATH_TO_MODEL: path to downloaded pretrained model or your own
            retrained model (a checkpoint dict with a 'model' entry)
        fold: dataset split to load ('train' by default); passed to
            CXR.RSNA_Dataset as its mode

    Returns:
        dataloader: iterator over single-example batches to show
        model: fine tuned torchvision densenet-121, on CPU
    """
    # map_location forces all tensors onto CPU even if the checkpoint
    # was saved from a GPU run
    checkpoint = torch.load(PATH_TO_MODEL,
                            map_location=lambda storage, loc: storage)
    model = checkpoint['model']
    del checkpoint  # release the rest of the checkpoint dict immediately
    model.cpu()

    # build dataloader on the requested fold;
    # ImageNet mean/std — the backbone was pretrained on ImageNet
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    data_transform = transforms.Compose([
        # transforms.Scale was deprecated and later removed from
        # torchvision; Resize is the drop-in replacement
        transforms.Resize(224),
        # Resize(224) scales the shorter side, so a crop is still
        # required to guarantee a 224 x 224 input
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])

    dataset = CXR.RSNA_Dataset(path_to_images=PATH_TO_IMAGES,
                               transform=data_transform,
                               mode=fold)

    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=1)

    return iter(dataloader), model
# Example 2
def train_full(PATH_TO_IMAGES, LR, WEIGHT_DECAY):
    """
    Train torchvision model to NIH data given high level hyperparameters.

    Args:
        PATH_TO_IMAGES: path to NIH images
        LR: learning rate
        WEIGHT_DECAY: weight decay parameter for SGD

    Returns:
        preds: torchvision model predictions on test fold with ground truth for comparison
        aucs: AUCs for each train,test tuple
        model: the trained model returned by train_model
    """

    #==========================================
    # Initialization
    #==========================================
    NUM_EPOCHS = 100
    BATCH_SIZE = 16

    # start from a clean results directory; ignore_errors covers the
    # first run, when the directory does not yet exist
    rmtree('results/', ignore_errors=True)
    os.makedirs("results/")

    # use imagenet mean,std for normalization
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    # 0 - Does not have pneumonia, and 1 - positive for pneumonia
    N_LABELS = 2

    #==========================================
    # Load labels
    #==========================================
    print("- Loading Data")
    # NOTE(review): df is not used below — the dataset presumably reads
    # the labels itself; kept so a missing labels file still fails early
    df = pd.read_csv("rsna_labels.csv", index_col=0)

    #==========================================
    # Define torchvision transforms
    #==========================================
    print("- Transforming Images")
    # transforms.Scale was removed from torchvision; Resize is the
    # replacement. Resize(224) scales the shorter side only, so
    # CenterCrop is still needed to guarantee 224 x 224.
    data_transforms = {
        'train':
        transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean, std)
        ]),
        'val':
        transforms.Compose([
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean, std)
        ]),
    }

    #==========================================
    # Create train/val dataloaders
    #==========================================
    transformed_datasets = {
        fold: CXR.RSNA_Dataset(path_to_images=PATH_TO_IMAGES,
                               mode=fold,
                               transform=data_transforms[fold])
        for fold in ('train', 'val')
    }

    dataloaders = {
        fold: torch.utils.data.DataLoader(transformed_datasets[fold],
                                          batch_size=BATCH_SIZE,
                                          shuffle=True,
                                          num_workers=8)
        for fold in ('train', 'val')
    }

    # please do not attempt to train without GPU as will take excessively long
    if not use_gpu:
        raise ValueError("Error, requires GPU")

    #==========================================
    # Loading cheXnet weights
    #==========================================
    # (the original constructed models.densenet121 here and immediately
    # discarded it — the checkpoint already contains the full model)
    PATH_TO_MODEL = "pretrained/checkpoint"
    checkpoint = torch.load(PATH_TO_MODEL,
                            map_location=lambda storage, loc: storage)
    model = checkpoint['model']
    del checkpoint
    model.cpu()

    #==========================================
    # Adapting last layer
    #==========================================
    # input size of the new head, read from the old classifier's first layer
    num_ftrs = model.classifier[0].in_features

    # switch the 14-way CheXNet classifier for a 2-class head;
    # dim=1 makes the softmax over the class dimension explicit
    # (Softmax() without dim is deprecated)
    model.classifier = nn.Sequential(nn.Linear(num_ftrs, N_LABELS),
                                     nn.Softmax(dim=1))

    #==========================================
    # Put model on GPU
    #==========================================
    model = model.cuda()

    #==========================================
    # Define criterion, optimizer for training
    #==========================================
    # BCELoss expects probabilities — hence the Softmax head above
    criterion = nn.BCELoss()
    optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                                 model.parameters()),
                          lr=LR,
                          momentum=0.9,
                          weight_decay=WEIGHT_DECAY)
    dataset_sizes = {x: len(transformed_datasets[x]) for x in ['train', 'val']}

    #==========================================
    # Train model
    #==========================================
    model, best_epoch = train_model(model,
                                    criterion,
                                    optimizer,
                                    LR,
                                    num_epochs=NUM_EPOCHS,
                                    dataloaders=dataloaders,
                                    dataset_sizes=dataset_sizes,
                                    weight_decay=WEIGHT_DECAY)

    #==========================================
    # Get preds and AUCs on test fold
    #==========================================
    preds, aucs = E.make_pred_multilabel(data_transforms, model,
                                         PATH_TO_IMAGES)

    return preds, aucs, model
# Example 3
def make_pred_multilabel(data_transforms, model, PATH_TO_IMAGES):
    """
    Gives predictions for test fold and calculates AUCs using previously trained model

    Args:
        data_transforms: torchvision transforms to preprocess raw images; same as validation transforms
        model: densenet-121 from torchvision previously fine tuned to training data
        PATH_TO_IMAGES: path at which NIH images can be found
    Returns:
        pred_df: dataframe containing individual predictions and ground truth for each test image
        auc_df: dataframe containing aggregate AUCs by train/test tuples
    """

    # calc preds in batches of 16, can reduce if your GPU has less RAM
    BATCH_SIZE = 16

    # set model to eval mode; required for proper predictions given use of batchnorm
    model.train(False)

    # create dataloader over the val fold
    dataset = CXR.RSNA_Dataset(path_to_images=PATH_TO_IMAGES,
                               mode="val",
                               transform=data_transforms['val'])
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=BATCH_SIZE,
                                             shuffle=False,
                                             num_workers=8)

    # accumulate per-image rows in plain lists and build the DataFrames
    # once at the end: DataFrame.append was removed in pandas 2.0 and
    # re-copied the whole frame on every call
    pred_rows = []
    true_rows = []

    # iterate over dataloader
    for i, data in enumerate(dataloader):

        inputs, labels, _ = data
        inputs, labels = Variable(inputs.cuda()), Variable(labels.cuda())

        true_labels = labels.cpu().data.numpy()

        outputs = model(inputs)
        probs = outputs.cpu().data.numpy()

        # get predictions and true values for each item in batch
        # (the last batch may be smaller than BATCH_SIZE)
        for j in range(true_labels.shape[0]):
            # shuffle=False, so dataloader order matches dataset.df order
            patient_id = dataset.df.index[BATCH_SIZE * i + j]
            thisrow = {"patientId": patient_id}
            truerow = {"patientId": patient_id}

            # each entry in the prediction vector corresponds to an
            # individual label
            for k in range(len(dataset.PRED_LABEL)):
                thisrow["prob_" + dataset.PRED_LABEL[k]] = probs[j, k]
                truerow[dataset.PRED_LABEL[k]] = true_labels[j, k]

            pred_rows.append(thisrow)
            true_rows.append(truerow)

        if (i % 10 == 0):
            print(str(i * BATCH_SIZE))

    pred_df = (pd.DataFrame(pred_rows)
               if pred_rows else pd.DataFrame(columns=["patientId"]))
    true_df = (pd.DataFrame(true_rows)
               if true_rows else pd.DataFrame(columns=["patientId"]))

    # calc AUCs, one row per NIH pathology label present in true_df
    NIH_LABELS = [
        'Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltration',
        'Mass', 'Nodule', 'Pneumonia', 'Pneumothorax', 'Consolidation',
        'Edema', 'Emphysema', 'Fibrosis', 'Pleural_Thickening',
        'Hernia'
    ]
    auc_rows = []
    for column in true_df:

        if column not in NIH_LABELS:
            continue
        actual = true_df[column]
        pred = pred_df["prob_" + column]
        thisrow = {'label': column, 'auc': np.nan}
        try:
            # .as_matrix() was removed from pandas; to_numpy() replaces it
            thisrow['auc'] = sklm.roc_auc_score(actual.to_numpy().astype(int),
                                                pred.to_numpy())
        except ValueError:
            # roc_auc_score raises ValueError when only one class is
            # present in the ground truth; leave the AUC as NaN
            print("can't calculate auc for " + str(column))
        auc_rows.append(thisrow)

    auc_df = pd.DataFrame(auc_rows, columns=["label", "auc"])

    pred_df.to_csv("results/preds.csv", index=False)
    auc_df.to_csv("results/aucs.csv", index=False)
    return pred_df, auc_df