Пример #1
0
def run(img_folder,
        img_extension='dcm',
        img_size=[288, 224],
        img_scale=4095,
        multi_view=False,
        do_featurewise_norm=True,
        featurewise_mean=398.5,
        featurewise_std=627.8,
        batch_size=16,
        samples_per_epoch=160,
        nb_epoch=20,
        balance_classes=.0,
        all_neg_skip=0.,
        pos_cls_weight=1.0,
        nb_init_filter=64,
        init_filter_size=7,
        init_conv_stride=2,
        pool_size=3,
        pool_stride=2,
        weight_decay=.0001,
        alpha=1.,
        l1_ratio=.5,
        inp_dropout=.0,
        hidden_dropout=.0,
        init_lr=.01,
        val_size=.2,
        lr_patience=5,
        es_patience=10,
        resume_from=None,
        net='resnet50',
        load_val_ram=False,
        exam_tsv='./metadata/exams_metadata.tsv',
        img_tsv='./metadata/images_crosswalk.tsv',
        best_model='./modelState/dm_resnet_best_model.h5',
        final_model="NOSAVE"):
    '''Run ResNet training on mammograms using an exam or image list

    The image list (or the exam list when `multi_view` is set) is split into
    stratified train and validation sets. A ResNet is then built, or loaded
    from `resume_from`, and trained with SGD through `fit_generator`. At the
    end, the epoch with the lowest validation loss is reported.

    The random seed, the number of generator workers and the number of GPUs
    are read from the env variables RANDOM_SEED, NUM_CPU_CORES and
    NUM_GPU_DEVICES.

    Args:
        img_folder, img_extension, exam_tsv, img_tsv: passed to DMMetaManager.
        img_size ([int, int]): used as the generator target size and as the
                last two dims of the model input shape.
        img_scale: passed to the generators as `target_scale`.
        multi_view (bool): train on the exam list with MultiViewResNetBuilder
                instead of on the image list with ResNetBuilder.
        do_featurewise_norm (bool): normalize with the given featurewise mean
                and std; otherwise use samplewise centering and std
                normalization.
        featurewise_mean, featurewise_std ([float]): they are estimated from
                1152 x 896 images. Using different sized images give very close
                results. For png, mean=7772, std=12187.
        batch_size, balance_classes, all_neg_skip: passed to the training
                generator. `batch_size` is also used by the validation
                generator unless `load_val_ram` is set.
        samples_per_epoch, nb_epoch: passed to `fit_generator`.
        pos_cls_weight (float): class weight of the positive class; the
                negative class always has weight 1.0.
        nb_init_filter, init_filter_size, init_conv_stride, pool_size,
        pool_stride, weight_decay, alpha, l1_ratio, inp_dropout,
        hidden_dropout: passed positionally, in this order, to the builder
                method selected by `net`.
        init_lr (float): initial learning rate of the SGD optimizer.
        val_size: passed to `train_test_split` as `test_size`.
        lr_patience, es_patience (int): patience of the ReduceLROnPlateau and
                EarlyStopping callbacks, both monitoring 'val_loss'.
        resume_from (str): if not None, load the model from this file instead
                of building a new one (`net` is then ignored).
        net (str): name of the architecture to build.
        load_val_ram (bool): draw the whole validation set as a single batch
                and keep it in memory.
        best_model (str): passed to DMAucModelCheckpoint.
        final_model (str): where to save the model after training; "NOSAVE"
                skips the save.

    Returns:
        The object returned by `model.fit_generator`.

    Raises:
        ValueError: if `resume_from` is None and `net` is not a known name.
        Exception: if `load_val_ram` is set and the loaded validation set does
                not contain `val_size_` samples.
    '''

    # Map each supported net name to the builder method that creates it.
    net_builders = {
        'resnet18': 'build_resnet_18',
        'resnet34': 'build_resnet_34',
        'resnet50': 'build_resnet_50',
        'dmresnet14': 'build_dm_resnet_14',
        'dmresnet47rb5': 'build_dm_resnet_47rb5',
        'dmresnet56rb6': 'build_dm_resnet_56rb6',
        'dmresnet65rb7': 'build_dm_resnet_65rb7',
        'resnet101': 'build_resnet_101',
        'resnet152': 'build_resnet_152',
    }
    # Fail fast: without this check an unknown name left `model` unbound and
    # only surfaced as a NameError after all the data had been set up.
    if resume_from is None and net not in net_builders:
        raise ValueError("Unknown net: %s" % (net,))

    # Read some env variables.
    random_seed = int(os.getenv('RANDOM_SEED', 12345))
    nb_worker = int(os.getenv('NUM_CPU_CORES', 4))
    gpu_count = int(os.getenv('NUM_GPU_DEVICES', 1))

    # Setup training and validation data.
    # Load image or exam lists and split them into train and val sets.
    meta_man = DMMetaManager(exam_tsv=exam_tsv,
                             img_tsv=img_tsv,
                             img_folder=img_folder,
                             img_extension=img_extension)
    if multi_view:
        exam_list = meta_man.get_flatten_exam_list()
        exam_train, exam_val = train_test_split(
            exam_list,
            test_size=val_size,
            random_state=random_seed,
            stratify=meta_man.exam_labs(exam_list))
        val_size_ = len(exam_val) * 2  # L and R.
    else:
        img_list, lab_list = meta_man.get_flatten_img_list()
        img_train, img_val, lab_train, lab_val = train_test_split(
            img_list,
            lab_list,
            test_size=val_size,
            random_state=random_seed,
            stratify=lab_list)
        val_size_ = len(img_val)

    # Create image generator.
    img_gen = DMImageDataGenerator(horizontal_flip=True, vertical_flip=True)
    if do_featurewise_norm:
        img_gen.featurewise_center = True
        img_gen.featurewise_std_normalization = True
        img_gen.mean = featurewise_mean
        img_gen.std = featurewise_std
    else:
        img_gen.samplewise_center = True
        img_gen.samplewise_std_normalization = True

    # The exam and image flows take the same keyword arguments; only the
    # leading positional data differs.
    target_size = (img_size[0], img_size[1])
    train_kwargs = dict(target_size=target_size,
                        target_scale=img_scale,
                        batch_size=batch_size,
                        balance_classes=balance_classes,
                        all_neg_skip=all_neg_skip,
                        shuffle=True,
                        seed=random_seed,
                        class_mode='binary')
    # When the validation set is held in RAM it is drawn as one single batch.
    val_kwargs = dict(target_size=target_size,
                      target_scale=img_scale,
                      batch_size=val_size_ if load_val_ram else batch_size,
                      validation_mode=True,
                      class_mode='binary')
    if multi_view:
        train_generator = img_gen.flow_from_exam_list(
            exam_train, **train_kwargs)
        val_generator = img_gen.flow_from_exam_list(exam_val, **val_kwargs)
    else:
        train_generator = img_gen.flow_from_img_list(
            img_train, lab_train, **train_kwargs)
        val_generator = img_gen.flow_from_img_list(
            img_val, lab_val, **val_kwargs)

    # Load validation set into RAM.
    if load_val_ram:
        validation_set = next(val_generator)
        # Multi-view batches carry two inputs; single-view batches carry one.
        # Each mode gets its own check: the earlier if/elif chain also applied
        # the two-input check to single-view batches whose size was correct.
        if multi_view:
            loaded_sizes = [len(validation_set[0][0]),
                            len(validation_set[0][1])]
        else:
            loaded_sizes = [len(validation_set[0])]
        if any(size != val_size_ for size in loaded_sizes):
            raise Exception(
                "Loaded validation set has size(s) %s, expected %d"
                % (loaded_sizes, val_size_))

    # Create model.
    if resume_from is not None:
        model = load_model(resume_from,
                           custom_objects={
                               'sensitivity': DMMetrics.sensitivity,
                               'specificity': DMMetrics.specificity
                           })
    else:
        builder = MultiViewResNetBuilder if multi_view else ResNetBuilder
        build_net = getattr(builder, net_builders[net])
        model = build_net(
            (1, img_size[0], img_size[1]), 1, nb_init_filter,
            init_filter_size, init_conv_stride, pool_size, pool_stride,
            weight_decay, alpha, l1_ratio, inp_dropout, hidden_dropout)

    if gpu_count > 1:
        model = make_parallel(model, gpu_count)

    # Model training.
    sgd = SGD(lr=init_lr, momentum=0.9, decay=0.0, nesterov=True)
    model.compile(optimizer=sgd,
                  loss='binary_crossentropy',
                  metrics=[DMMetrics.sensitivity, DMMetrics.specificity])
    reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                  factor=0.1,
                                  patience=lr_patience,
                                  verbose=1)
    early_stopping = EarlyStopping(monitor='val_loss',
                                   patience=es_patience,
                                   verbose=1)
    if load_val_ram:
        auc_checkpointer = DMAucModelCheckpoint(best_model,
                                                validation_set,
                                                batch_size=batch_size)
    else:
        auc_checkpointer = DMAucModelCheckpoint(best_model,
                                                val_generator,
                                                nb_test_samples=val_size_)
    hist = model.fit_generator(
        train_generator,
        samples_per_epoch=samples_per_epoch,
        nb_epoch=nb_epoch,
        class_weight={
            0: 1.0,
            1: pos_cls_weight
        },
        validation_data=validation_set if load_val_ram else val_generator,
        nb_val_samples=val_size_,
        callbacks=[reduce_lr, early_stopping, auc_checkpointer],
        nb_worker=nb_worker,
        pickle_safe=True,  # turn on pickle_safe to avoid a strange error.
        verbose=2)

    # Training report.
    # argmin gives the first epoch with the lowest val loss and, unlike
    # comparing the history list to a scalar, works for plain Python floats.
    best_idx = np.argmin(hist.history['val_loss'])
    best_val_loss = hist.history['val_loss'][best_idx]
    best_val_sensitivity = hist.history['val_sensitivity'][best_idx]
    best_val_specificity = hist.history['val_specificity'][best_idx]
    print("\n==== Training summary ====")
    print("Minimum val loss achieved at epoch: %s" % (best_idx + 1,))
    print("Best val loss: %s" % (best_val_loss,))
    print("Best val sensitivity: %s" % (best_val_sensitivity,))
    print("Best val specificity: %s" % (best_val_specificity,))

    if final_model != "NOSAVE":
        model.save(final_model)

    return hist
Пример #2
0
def run(img_folder, dl_state, img_extension='dcm',
        img_height=1024, img_scale=4095, val_size=.2, neg_vs_pos_ratio=10.,
        do_featurewise_norm=True, featurewise_mean=873.6, featurewise_std=739.3,
        img_per_batch=2, roi_per_img=32, roi_size=(256, 256),
        low_int_threshold=.05, blob_min_area=3,
        blob_min_int=.5, blob_max_int=.85, blob_th_step=10,
        exam_tsv='./metadata/exams_metadata.tsv',
        img_tsv='./metadata/images_crosswalk.tsv',
        train_out='./modelState/meta_prob_train.pkl',
        test_out='./modelState/meta_prob_test.pkl'):
    '''Predict on train and test exams with a trained DL model

    The subjects are split into stratified train and test sets and the
    negatives are optionally subsampled. The model stored in `dl_state` is
    then loaded, `get_exam_pred` is run on the exam list of each set, and the
    two results are pickled.

    The random seed and the number of GPUs are read from the env variables
    RANDOM_SEED and NUM_GPU_DEVICES.

    Args:
        img_folder, img_extension, exam_tsv, img_tsv: passed to DMMetaManager.
        dl_state (str): file of the trained model to load.
        img_height, img_scale: passed to `get_exam_pred` as `target_height`
                and `target_scale`.
        val_size: passed to `train_test_split` as `test_size`.
        neg_vs_pos_ratio (float): keep at most this many negative subjects
                per positive subject in each set; None disables subsetting.
        do_featurewise_norm (bool): normalize with the given featurewise mean
                and std; otherwise use samplewise centering and std
                normalization.
        img_per_batch, roi_per_img, roi_size, low_int_threshold,
        blob_min_area, blob_min_int, blob_max_int, blob_th_step: passed to
                `get_exam_pred`.
        train_out, test_out (str): files the train and test results are
                pickled to.
    '''

    # Read some env variables.
    random_seed = int(os.getenv('RANDOM_SEED', 12345))
    rng = RandomState(random_seed)  # an rng used across board.
    gpu_count = int(os.getenv('NUM_GPU_DEVICES', 1))

    # Load and split image and label lists.
    meta_man = DMMetaManager(exam_tsv=exam_tsv,
                             img_tsv=img_tsv,
                             img_folder=img_folder,
                             img_extension=img_extension)
    subj_list, subj_labs = meta_man.get_subj_labs()
    subj_train, subj_test, labs_train, labs_test = train_test_split(
        subj_list, subj_labs, test_size=val_size, stratify=subj_labs,
        random_state=random_seed)
    if neg_vs_pos_ratio is not None:
        def subset_subj(subj, labs):
            '''Keep all positives and a random subset of the negatives.'''
            subj = np.array(subj)
            labs = np.array(labs)
            pos_idx = np.where(labs == 1)[0]
            neg_idx = np.where(labs == 0)[0]
            nb_neg_desired = int(len(pos_idx)*neg_vs_pos_ratio)
            # Not enough negatives to subsample: keep every subject.
            if nb_neg_desired >= len(neg_idx):
                return subj.tolist()
            neg_chosen = rng.choice(neg_idx, nb_neg_desired, replace=False)
            subset_idx = np.concatenate([pos_idx, neg_chosen])
            return subj[subset_idx].tolist()

        subj_train = subset_subj(subj_train, labs_train)
        subj_test = subset_subj(subj_test, labs_test)

    # Create image generator for ROIs for representation extraction.
    print("Create an image generator for ROIs")
    sys.stdout.flush()
    if do_featurewise_norm:
        imgen = DMImageDataGenerator(
            featurewise_center=True,
            featurewise_std_normalization=True)
        imgen.mean = featurewise_mean
        imgen.std = featurewise_std
    else:
        imgen = DMImageDataGenerator(
            samplewise_center=True,
            samplewise_std_normalization=True)

    # Load DL model.
    print("Load DL classification model: %s" % (dl_state,))
    sys.stdout.flush()
    dl_model = load_model(
        dl_state,
        custom_objects={
            'sensitivity': dmm.sensitivity,
            'specificity': dmm.specificity
        }
    )
    if gpu_count > 1:
        print("Make the model parallel on %d GPUs" % (gpu_count,))
        sys.stdout.flush()
        dl_model = make_parallel(dl_model, gpu_count)

    # Read exam lists.
    exam_train = meta_man.get_flatten_exam_list(
        subj_train, flatten_img_list=True)
    exam_test = meta_man.get_flatten_exam_list(
        subj_test, flatten_img_list=True)
    exam_labs_train = np.array(meta_man.exam_labs(exam_train))
    exam_labs_test = np.array(meta_man.exam_labs(exam_test))
    nb_pos_exams_train = (exam_labs_train == 1).sum()
    nb_neg_exams_train = (exam_labs_train == 0).sum()
    nb_pos_exams_test = (exam_labs_test == 1).sum()
    nb_neg_exams_test = (exam_labs_test == 0).sum()
    print("Train set - Nb of pos exams: %d, Nb of neg exams: %d" %
          (nb_pos_exams_train, nb_neg_exams_train))
    print("Test set - Nb of pos exams: %d, Nb of neg exams: %d" %
          (nb_pos_exams_test, nb_neg_exams_test))

    # Make predictions for exam lists. Both sets use the same settings.
    pred_kwargs = dict(
        target_height=img_height, target_scale=img_scale,
        img_per_batch=img_per_batch, roi_size=roi_size,
        low_int_threshold=low_int_threshold, blob_min_area=blob_min_area,
        blob_min_int=blob_min_int, blob_max_int=blob_max_int,
        blob_th_step=blob_th_step, seed=random_seed,
        dl_model=dl_model)

    print("Predicting for train exam list")
    sys.stdout.flush()
    meta_prob_train = get_exam_pred(
        exam_train, roi_per_img, imgen, **pred_kwargs)
    print("Length of train prediction list: %s" % (len(meta_prob_train),))
    sys.stdout.flush()

    print("Predicting for test exam list")
    sys.stdout.flush()
    meta_prob_test = get_exam_pred(
        exam_test, roi_per_img, imgen, **pred_kwargs)
    print("Length of test prediction list: %s" % (len(meta_prob_test),))
    sys.stdout.flush()

    # Write through context managers so the files are closed (and flushed)
    # deterministically; pickles are written in binary mode.
    with open(train_out, 'wb') as train_file:
        pickle.dump(meta_prob_train, train_file)
    with open(test_out, 'wb') as test_file:
        pickle.dump(meta_prob_test, test_file)
    print("Done.")
Пример #3
0
def run(img_folder,
        img_extension='png',
        img_size=[288, 224],
        multi_view=False,
        do_featurewise_norm=True,
        featurewise_mean=7772.,
        featurewise_std=12187.,
        batch_size=16,
        samples_per_epoch=160,
        nb_epoch=20,
        val_size=.2,
        balance_classes=0.,
        all_neg_skip=False,
        pos_cls_weight=1.0,
        alpha=1.,
        l1_ratio=.5,
        init_lr=.01,
        lr_patience=2,
        es_patience=4,
        exam_tsv='./metadata/exams_metadata.tsv',
        img_tsv='./metadata/images_crosswalk.tsv',
        dl_state='./modelState/resnet50_288_best_model.h5',
        best_model='./modelState/enet_288_best_model.h5',
        final_model="NOSAVE"):
    '''Train an elastic net classifier on top of a DL model's representation

    The DL model stored in `dl_state` is loaded and cut at its second last
    layer. The output of that layer is used as the feature vector for an
    SGDClassifier with log loss and elastic net penalty, trained batch by
    batch with `partial_fit`. After each epoch the classifier is scored on
    the validation set; the classifier with the best AUROC is pickled to
    `best_model`.

    The random seed, the number of jobs and the number of GPUs are read from
    the env variables RANDOM_SEED, NUM_CPU_CORES and NUM_GPU_DEVICES.

    Args:
        img_folder, img_extension, exam_tsv, img_tsv: passed to DMMetaManager.
        img_size ([int, int]): used as the generator target size.
        multi_view (bool): use the exam list instead of the image list.
        do_featurewise_norm (bool): normalize with the given featurewise mean
                and std; otherwise use samplewise centering and std
                normalization.
        batch_size, balance_classes, all_neg_skip: passed to the training
                generator; `batch_size` is also used for validation.
        samples_per_epoch (int): an epoch ends once at least this many
                training samples have been seen.
        nb_epoch (int): maximum number of epochs.
        val_size: passed to `train_test_split` as `test_size`.
        pos_cls_weight (float): class weight of the positive class; the
                negative class always has weight 1.0.
        alpha, l1_ratio: passed to SGDClassifier.
        init_lr (float): initial (constant) learning rate `eta0`.
        lr_patience, es_patience (int): number of epochs since the best
                validation loss after which the learning rate is reduced and
                the training is stopped, respectively.
        dl_state (str): file of the trained DL model to load.
        best_model (str): file the best-AUROC classifier is pickled to;
                "NOSAVE" skips the save.
        final_model (str): file the classifier is pickled to after training;
                "NOSAVE" skips the save.
    '''

    # Read some env variables.
    random_seed = int(os.getenv('RANDOM_SEED', 12345))
    nb_worker = int(os.getenv('NUM_CPU_CORES', 4))
    gpu_count = int(os.getenv('NUM_GPU_DEVICES', 1))

    # Setup training and validation data.
    meta_man = DMMetaManager(exam_tsv=exam_tsv,
                             img_tsv=img_tsv,
                             img_folder=img_folder,
                             img_extension=img_extension)

    if multi_view:
        exam_list = meta_man.get_flatten_exam_list()
        exam_train, exam_val = train_test_split(
            exam_list,
            test_size=val_size,
            random_state=random_seed,
            stratify=meta_man.exam_labs(exam_list))
        val_size_ = len(exam_val) * 2  # L and R.
    else:
        img_list, lab_list = meta_man.get_flatten_img_list()
        img_train, img_val, lab_train, lab_val = train_test_split(
            img_list,
            lab_list,
            test_size=val_size,
            random_state=random_seed,
            stratify=lab_list)
        val_size_ = len(img_val)

    img_gen = DMImageDataGenerator(horizontal_flip=True, vertical_flip=True)
    if do_featurewise_norm:
        img_gen.featurewise_center = True
        img_gen.featurewise_std_normalization = True
        img_gen.mean = featurewise_mean
        img_gen.std = featurewise_std
    else:
        img_gen.samplewise_center = True
        img_gen.samplewise_std_normalization = True

    # The exam and image flows take the same keyword arguments; only the
    # leading positional data differs.
    target_size = (img_size[0], img_size[1])
    train_kwargs = dict(target_size=target_size,
                        batch_size=batch_size,
                        balance_classes=balance_classes,
                        all_neg_skip=all_neg_skip,
                        shuffle=True,
                        seed=random_seed,
                        class_mode='binary')
    val_kwargs = dict(target_size=target_size,
                      batch_size=batch_size,
                      validation_mode=True,
                      class_mode='binary')
    if multi_view:
        train_generator = img_gen.flow_from_exam_list(
            exam_train, **train_kwargs)
        val_generator = img_gen.flow_from_exam_list(exam_val, **val_kwargs)
    else:
        train_generator = img_gen.flow_from_img_list(
            img_train, lab_train, **train_kwargs)
        val_generator = img_gen.flow_from_img_list(
            img_val, lab_val, **val_kwargs)

    # Deep learning model.
    dl_model = load_model(dl_state,
                          custom_objects={
                              'sensitivity': DMMetrics.sensitivity,
                              'specificity': DMMetrics.specificity
                          })
    # The second last layer's output serves as the image representation.
    reprlayer_model = Model(input=dl_model.input,
                            output=dl_model.get_layer(index=-2).output)
    if gpu_count > 1:
        reprlayer_model = make_parallel(reprlayer_model, gpu_count)

    # Setup test data in RAM.
    X_test, y_test = dlrepr_generator(reprlayer_model, val_generator,
                                      val_size_)

    # Evaluate DL model on the test data.
    val_generator.reset()
    dl_test_pred = dl_model.predict_generator(val_generator,
                                              val_samples=val_size_,
                                              nb_worker=1,
                                              pickle_safe=False)
    # Set nb_worker to >1 can cause:
    # either inconsistent result when pickle_safe is False,
    #     or broadcasting error when pickle_safe is True.
    # This seems to be a Keras bug!!
    # Further note: the broadcasting error may only happen when val_size_
    # is not divisible by batch_size.
    try:
        dl_auc = roc_auc_score(y_test, dl_test_pred)
        dl_loss = log_loss(y_test, dl_test_pred)
    except ValueError:
        # Best effort: report placeholder scores when they cannot be computed.
        dl_auc = 0.
        dl_loss = np.inf
    print("\nAUROC by the DL model: %.4f, loss: %.4f" % (dl_auc, dl_loss))

    # Elastic net training.
    target_classes = np.array([0, 1])
    sgd_clf = SGDClassifier(loss='log',
                            penalty='elasticnet',
                            alpha=alpha,
                            l1_ratio=l1_ratio,
                            verbose=0,
                            n_jobs=nb_worker,
                            learning_rate='constant',
                            eta0=init_lr,
                            random_state=random_seed,
                            class_weight={
                                0: 1.0,
                                1: pos_cls_weight
                            })
    curr_lr = init_lr
    best_epoch = 0
    best_auc = 0.
    min_loss = np.inf
    min_loss_epoch = 0
    for epoch in range(nb_epoch):
        samples_seen = 0
        X_list = []
        y_list = []
        epoch_start = time.time()
        while samples_seen < samples_per_epoch:
            X, y = next(train_generator)
            X_repr = reprlayer_model.predict_on_batch(X)
            sgd_clf.partial_fit(X_repr, y, classes=target_classes)
            samples_seen += len(y)
            X_list.append(X_repr)
            y_list.append(y)
        # The training X, y are expected to change for each epoch due to
        # image random sampling and class balancing.
        X_train_epo = np.concatenate(X_list)
        y_train_epo = np.concatenate(y_list)
        # End of epoch summary.
        pred_prob = sgd_clf.predict_proba(X_test)[:, 1]
        train_prob = sgd_clf.predict_proba(X_train_epo)[:, 1]
        try:
            auc = roc_auc_score(y_test, pred_prob)
            crossentropy_loss = log_loss(y_test, pred_prob)
        except ValueError:
            auc = 0.
            crossentropy_loss = np.inf
        try:
            train_loss = log_loss(y_train_epo, train_prob)
        except ValueError:
            train_loss = np.inf
        # Fraction of coefficients that are exactly zero.
        wei_sparseness = np.mean(sgd_clf.coef_ == 0)
        epoch_span = time.time() - epoch_start
        print(("%ds - Epoch=%d, auc=%.4f, train_loss=%.4f, test_loss=%.4f, "
               "weight sparsity=%.4f") %
              (epoch_span, epoch + 1, auc, train_loss, crossentropy_loss,
               wei_sparseness))
        # Model checkpoint, reducing learning rate and early stopping.
        if auc > best_auc:
            best_epoch = epoch + 1
            best_auc = auc
            if best_model != "NOSAVE":
                # Pickles are written in binary mode.
                with open(best_model, 'wb') as best_state:
                    pickle.dump(sgd_clf, best_state)
        if crossentropy_loss < min_loss:
            min_loss = crossentropy_loss
            min_loss_epoch = epoch + 1
        else:
            epochs_since_best = epoch + 1 - min_loss_epoch
            if epochs_since_best >= es_patience:
                print('Early stopping criterion has reached. Stop training.')
                break
            # Note: once lr_patience is reached, the rate is cut again on
            # every further epoch that does not improve the val loss.
            if epochs_since_best >= lr_patience:
                curr_lr *= .1
                sgd_clf.set_params(eta0=curr_lr)
                print("Reducing learning rate to: %s" % (curr_lr,))
    # End of training summary
    print(">>> Found best AUROC: %.4f at epoch: %d, saved to: %s <<<" %
          (best_auc, best_epoch, best_model))
    print(">>> Found best val loss: %.4f at epoch: %d. <<<" %
          (min_loss, min_loss_epoch))
    #### Save elastic net model!! ####
    if final_model != "NOSAVE":
        with open(final_model, 'wb') as final_state:
            pickle.dump(sgd_clf, final_state)
Пример #4
0
def run(img_folder,
        dl_state,
        img_extension='dcm',
        img_height=1024,
        img_scale=255.,
        equalize_hist=False,
        featurewise_center=False,
        featurewise_mean=91.6,
        neg_vs_pos_ratio=1.,
        net='vgg19',
        batch_size=128,
        patch_size=256,
        stride=8,
        exam_tsv='./metadata/exams_metadata.tsv',
        img_tsv='./metadata/images_crosswalk.tsv',
        out='./modelState/prob_heatmap.pkl',
        predicted_subj_file=None,
        add_subjs=500):
    '''Sweep mammograms with trained DL model to create prob heatmaps

    For every exam of the selected subjects, `get_prob_heatmap` is run on the
    CC and MLO views of the left and right breasts. The results are collected
    in a list of `(e[0], e[1], {'L': {...}, 'R': {...}})` tuples, where each
    breast dict holds the keys 'cancer', 'CC' and 'MLO', and the list is
    pickled to `out`.

    The random seed and the number of GPUs are read from the env variables
    RANDOM_SEED and NUM_GPU_DEVICES.

    Args:
        img_folder, img_extension, exam_tsv, img_tsv: passed to DMMetaManager.
        dl_state (str): file of the trained patch classifier to load.
        img_height, img_scale, patch_size, stride, batch_size, equalize_hist,
        featurewise_mean: passed to `get_prob_heatmap`.
        featurewise_center (bool): passed to `get_prob_heatmap`; when set, no
                net-specific preprocess function is loaded.
        neg_vs_pos_ratio (float): passed to `DMMetaManager.subset_subj_list`
                to subset the subjects; None disables subsetting. Ignored
                when `predicted_subj_file` is given.
        net (str): selects which `keras.applications` preprocess function to
                use: 'resnet50', 'vgg16', 'vgg19', 'xception' or 'inception'.
        out (str): file the heatmap list is pickled to.
        predicted_subj_file (str): if not None, a file readable by `np.load`
                listing subjects to exclude from this run.
        add_subjs (int): max number of remaining subjects to process when
                `predicted_subj_file` is given.

    Raises:
        Exception: if `featurewise_center` is False and `net` is not one of
                the supported names.
    '''
    # Read some env variables.
    random_seed = int(os.getenv('RANDOM_SEED', 12345))
    gpu_count = int(os.getenv('NUM_GPU_DEVICES', 1))

    # Load and split image and label lists.
    meta_man = DMMetaManager(exam_tsv=exam_tsv,
                             img_tsv=img_tsv,
                             img_folder=img_folder,
                             img_extension=img_extension)
    subj_list, subj_labs = meta_man.get_subj_labs()
    subj_labs = np.array(subj_labs)
    print("Found %d subjests" % (len(subj_list),))
    print("cancer patients=%d, normal patients=%d"
          % ((subj_labs == 1).sum(), (subj_labs == 0).sum()))
    if predicted_subj_file is not None:
        # Only process subjects that are not in the given file.
        predicted_subjs = np.load(predicted_subj_file)
        subj_list = np.setdiff1d(subj_list, predicted_subjs)
        subj_list = subj_list[:add_subjs]
        print("Will predict additional %d subjects" % (len(subj_list),))
    elif neg_vs_pos_ratio is not None:
        subj_list, subj_labs = DMMetaManager.subset_subj_list(
            subj_list, subj_labs, neg_vs_pos_ratio, random_seed)
        subj_labs = np.array(subj_labs)
        print("After subsetting, there are %d subjects" % (len(subj_list),))
        print("cancer patients=%d, normal patients=%d"
              % ((subj_labs == 1).sum(), (subj_labs == 0).sum()))

    # Get exam lists.
    print("Get flattened exam list")
    exam_list = meta_man.get_flatten_exam_list(subj_list, cc_mlo_only=True)
    exam_labs = meta_man.exam_labs(exam_list)
    exam_labs = np.array(exam_labs)
    print("positive exams=%d, negative exams=%d"
          % ((exam_labs == 1).sum(), (exam_labs == 0).sum()))
    sys.stdout.flush()

    # Load DL model.
    print("Load patch classifier: %s" % (dl_state,))
    sys.stdout.flush()
    dl_model = load_model(dl_state,
                          custom_objects={
                              'sensitivity': dmm.sensitivity,
                              'specificity': dmm.specificity
                          })

    parallelized = gpu_count > 1
    if parallelized:
        print("Make the model parallel on %d GPUs" % (gpu_count,))
        sys.stdout.flush()
        dl_model, _ = make_parallel(dl_model, gpu_count)

    # Load preprocess function.
    if featurewise_center:
        preprocess_input = None
    else:
        print("Load preprocess function for net: %s" % (net,))
        if net == 'resnet50':
            from keras.applications.resnet50 import preprocess_input
        elif net == 'vgg16':
            from keras.applications.vgg16 import preprocess_input
        elif net == 'vgg19':
            from keras.applications.vgg19 import preprocess_input
        elif net == 'xception':
            from keras.applications.xception import preprocess_input
        elif net == 'inception':
            from keras.applications.inception_v3 import preprocess_input
        else:
            raise Exception("Pretrained model is not available: " + net)

    # Sweep the whole images and classify patches.
    print("Generate prob heatmaps for exam list")
    sys.stdout.flush()
    heatmap_dat_list = []
    nb_exams = len(exam_list)
    for i, e in enumerate(exam_list):
        breasts = e[2]
        dat = (e[0], e[1], {
            'L': {
                'cancer': breasts['L']['cancer']
            },
            'R': {
                'cancer': breasts['R']['cancer']
            }
        })
        # Same settings for all four images of an exam; processed in the
        # order L-CC, L-MLO, R-CC, R-MLO.
        for side in ('L', 'R'):
            for view in ('CC', 'MLO'):
                dat[2][side][view] = get_prob_heatmap(
                    breasts[side][view],
                    img_height,
                    img_scale,
                    patch_size,
                    stride,
                    dl_model,
                    batch_size,
                    featurewise_center=featurewise_center,
                    featurewise_mean=featurewise_mean,
                    preprocess=preprocess_input,
                    parallelized=parallelized,
                    equalize_hist=equalize_hist)
        heatmap_dat_list.append(dat)
        print("processed %d/%d exams" % (i + 1, nb_exams))
        sys.stdout.flush()
    print("Done.")

    # Save the result.
    # No newline here so that the closing "Done." ends up on the same line.
    sys.stdout.write("Saving result to external files. ")
    sys.stdout.flush()
    # Write through a context manager so the file is closed (and flushed)
    # deterministically; pickles are written in binary mode.
    with open(out, 'wb') as out_file:
        pickle.dump(heatmap_dat_list, out_file)
    print("Done.")
Пример #5
0
def run(img_folder,
        dl_state,
        img_extension='dcm',
        img_height=1024,
        img_scale=4095,
        val_size=.2,
        neg_vs_pos_ratio=10.,
        do_featurewise_norm=True,
        featurewise_mean=873.6,
        featurewise_std=739.3,
        img_per_batch=2,
        roi_per_img=32,
        roi_size=(256, 256),
        low_int_threshold=.05,
        blob_min_area=3,
        blob_min_int=.5,
        blob_max_int=.85,
        blob_th_step=10,
        layer_name=['flatten_1', 'dense_1'],
        layer_index=None,
        roi_state=None,
        roi_clf_bs=32,
        pc_components=.95,
        pc_whiten=True,
        nb_words=[512],
        km_max_iter=100,
        km_bs=1000,
        km_patience=20,
        km_init=10,
        exam_tsv='./metadata/exams_metadata.tsv',
        img_tsv='./metadata/images_crosswalk.tsv',
        pca_km_states='./modelState/dlrepr_pca_km_models.pkl',
        bow_train_out='./modelState/bow_dat_train.pkl',
        bow_test_out='./modelState/bow_dat_test.pkl'):
    '''Calculate bag of deep visual words count matrix for all breasts
    '''

    # Read some env variables.
    random_seed = int(os.getenv('RANDOM_SEED', 12345))
    rng = RandomState(random_seed)  # an rng used across board.

    # Load and split image and label lists.
    meta_man = DMMetaManager(exam_tsv=exam_tsv,
                             img_tsv=img_tsv,
                             img_folder=img_folder,
                             img_extension=img_extension)
    subj_list, subj_labs = meta_man.get_subj_labs()
    subj_train, subj_test, labs_train, labs_test = train_test_split(
        subj_list,
        subj_labs,
        test_size=val_size,
        stratify=subj_labs,
        random_state=random_seed)
    if neg_vs_pos_ratio is not None:

        def subset_subj(subj, labs):
            subj = np.array(subj)
            labs = np.array(labs)
            pos_idx = np.where(labs == 1)[0]
            neg_idx = np.where(labs == 0)[0]
            nb_neg_desired = int(len(pos_idx) * neg_vs_pos_ratio)
            if nb_neg_desired >= len(neg_idx):
                return subj.tolist()
            else:
                neg_chosen = rng.choice(neg_idx, nb_neg_desired, replace=False)
                subset_idx = np.concatenate([pos_idx, neg_chosen])
                return subj[subset_idx].tolist()

        subj_train = subset_subj(subj_train, labs_train)
        subj_test = subset_subj(subj_test, labs_test)

    img_list, lab_list = meta_man.get_flatten_img_list(subj_train)
    lab_list = np.array(lab_list)
    print "Train set - Nb of positive images: %d, Nb of negative images: %d" \
            % ( (lab_list==1).sum(), (lab_list==0).sum())
    sys.stdout.flush()

    # Create image generator for ROIs for representation extraction.
    print "Create an image generator for ROIs"
    sys.stdout.flush()
    if do_featurewise_norm:
        imgen = DMImageDataGenerator(featurewise_center=True,
                                     featurewise_std_normalization=True)
        imgen.mean = featurewise_mean
        imgen.std = featurewise_std
    else:
        imgen = DMImageDataGenerator(samplewise_center=True,
                                     samplewise_std_normalization=True)

    # Load ROI classifier.
    if roi_state is not None:
        print "Load ROI classifier"
        sys.stdout.flush()
        roi_clf = load_model(roi_state,
                             custom_objects={
                                 'sensitivity': dmm.sensitivity,
                                 'specificity': dmm.specificity
                             })
        graph = tf.get_default_graph()
    else:
        roi_clf = None
        graph = None

    # Create ROI generators for pos and neg images separately.
    print "Create ROI generators for pos and neg images"
    sys.stdout.flush()
    roi_generator = imgen.flow_from_candid_roi(
        img_list,
        target_height=img_height,
        target_scale=img_scale,
        class_mode=None,
        validation_mode=True,
        img_per_batch=img_per_batch,
        roi_per_img=roi_per_img,
        roi_size=roi_size,
        low_int_threshold=low_int_threshold,
        blob_min_area=blob_min_area,
        blob_min_int=blob_min_int,
        blob_max_int=blob_max_int,
        blob_th_step=blob_th_step,
        tf_graph=graph,
        roi_clf=roi_clf,
        clf_bs=roi_clf_bs,
        return_sample_weight=False,
        seed=random_seed)

    # Generate image patches and extract their DL representations.
    print "Load DL representation model"
    sys.stdout.flush()
    dlrepr_model = DLRepr(dl_state,
                          custom_objects={
                              'sensitivity': dmm.sensitivity,
                              'specificity': dmm.specificity
                          },
                          layer_name=layer_name,
                          layer_index=layer_index)
    last_output_size = dlrepr_model.get_output_shape()[-1][-1]
    if last_output_size != 3 and last_output_size != 1:
        raise Exception("The last output must be prob outputs (size=3 or 1)")

    nb_tot_samples = len(img_list) * roi_per_img
    print "Extract ROIs from pos and neg images"
    sys.stdout.flush()
    pred = dlrepr_model.predict_generator(roi_generator,
                                          val_samples=nb_tot_samples)
    for i, d in enumerate(pred):
        print "Shape of representation/output data %d:" % (i), d.shape
    sys.stdout.flush()

    # Flatten feature maps, e.g. an 8x8 feature map will become a 64-d vector.
    pred = [d.reshape((-1, d.shape[-1])) for d in pred]
    for i, d in enumerate(pred):
        print "Shape of flattened data %d:" % (i), d.shape
    sys.stdout.flush()

    # Split representations and prob outputs.
    dl_repr = pred[0]
    prob_out = pred[1]
    if prob_out.shape[1] == 3:
        prob_out = prob_out[:, 1]  # pos class.
    prob_out = prob_out.reshape((len(img_list), -1))
    print "Reshape prob output to:", prob_out.shape
    sys.stdout.flush()

    # Use PCA to reduce dimension of the representation data.
    if pc_components is not None:
        print "Start PCA dimension reduction on DL representation"
        sys.stdout.flush()
        pca = PCA(n_components=pc_components, whiten=pc_whiten)
        pca.fit(dl_repr)
        print "Nb of PCA components:", pca.n_components_
        print "Total explained variance ratio: %.4f" % \
                (pca.explained_variance_ratio_.sum())
        dl_repr_pca = pca.transform(dl_repr)
        print "Shape of transformed representation data:", dl_repr_pca.shape
        sys.stdout.flush()
    else:
        pca = None

    # Use K-means to create a codebook for deep visual words.
    print "Start K-means training on DL representation"
    sys.stdout.flush()
    clf_list = []
    clust_list = []
    # Shuffling indices for mini-batches learning.
    perm_idx = rng.permutation(len(dl_repr))
    for n in nb_words:
        print "Train K-means with %d cluster centers" % (n)
        sys.stdout.flush()
        clf = MiniBatchKMeans(n_clusters=n,
                              init='k-means++',
                              max_iter=km_max_iter,
                              batch_size=km_bs,
                              compute_labels=True,
                              random_state=random_seed,
                              tol=0.0,
                              max_no_improvement=km_patience,
                              init_size=None,
                              n_init=km_init,
                              reassignment_ratio=0.01,
                              verbose=0)
        clf.fit(dl_repr[perm_idx])
        clf_list.append(clf)
        clust = np.zeros_like(clf.labels_)
        clust[perm_idx] = clf.labels_
        clust = clust.reshape((len(img_list), -1))
        clust_list.append(clust)

    if pca is not None:
        print "Start K-means training on transformed representation"
        sys.stdout.flush()
        clf_list_pca = []
        clust_list_pca = []
        # Shuffling indices for mini-batches learning.
        perm_idx = rng.permutation(len(dl_repr_pca))
        for n in nb_words:
            print "Train K-means with %d cluster centers" % (n)
            sys.stdout.flush()
            clf = MiniBatchKMeans(n_clusters=n,
                                  init='k-means++',
                                  max_iter=km_max_iter,
                                  batch_size=km_bs,
                                  compute_labels=True,
                                  random_state=random_seed,
                                  tol=0.0,
                                  max_no_improvement=km_patience,
                                  init_size=None,
                                  n_init=km_init,
                                  reassignment_ratio=0.01,
                                  verbose=0)
            clf.fit(dl_repr_pca[perm_idx])
            clf_list_pca.append(clf)
            clust = np.zeros_like(clf.labels_)
            clust[perm_idx] = clf.labels_
            clust = clust.reshape((len(img_list), -1))
            clust_list_pca.append(clust)

    # Read exam lists.
    exam_train = meta_man.get_flatten_exam_list(subj_train,
                                                flatten_img_list=True)
    exam_test = meta_man.get_flatten_exam_list(subj_test,
                                               flatten_img_list=True)
    exam_labs_train = np.array(meta_man.exam_labs(exam_train))
    exam_labs_test = np.array(meta_man.exam_labs(exam_test))
    nb_pos_exams_train = (exam_labs_train == 1).sum()
    nb_neg_exams_train = (exam_labs_train == 0).sum()
    nb_pos_exams_test = (exam_labs_test == 1).sum()
    nb_neg_exams_test = (exam_labs_test == 0).sum()
    print "Train set - Nb of pos exams: %d, Nb of neg exams: %d" % \
            (nb_pos_exams_train, nb_neg_exams_train)
    print "Test set - Nb of pos exams: %d, Nb of neg exams: %d" % \
            (nb_pos_exams_test, nb_neg_exams_test)

    # Do BoW counts for each breast.
    print "BoW counting for train exam list"
    sys.stdout.flush()
    bow_dat_train = get_exam_bow_dat(exam_train,
                                     nb_words,
                                     roi_per_img,
                                     img_list=img_list,
                                     prob_out=prob_out,
                                     clust_list=clust_list)
    for i, d in enumerate(bow_dat_train[1]):
        print "Shape of train BoW matrix %d:" % (i), d.shape
    sys.stdout.flush()

    print "BoW counting for test exam list"
    sys.stdout.flush()
    bow_dat_test = get_exam_bow_dat(exam_test,
                                    nb_words,
                                    roi_per_img,
                                    imgen=imgen,
                                    clf_list=clf_list,
                                    transformer=None,
                                    target_height=img_height,
                                    target_scale=img_scale,
                                    img_per_batch=img_per_batch,
                                    roi_size=roi_size,
                                    low_int_threshold=low_int_threshold,
                                    blob_min_area=blob_min_area,
                                    blob_min_int=blob_min_int,
                                    blob_max_int=blob_max_int,
                                    blob_th_step=blob_th_step,
                                    seed=random_seed,
                                    dlrepr_model=dlrepr_model)
    for i, d in enumerate(bow_dat_test[1]):
        print "Shape of test BoW matrix %d:" % (i), d.shape
    sys.stdout.flush()

    if pca is not None:
        print "== Do same BoW counting on PCA transformed data =="
        print "BoW counting for train exam list"
        sys.stdout.flush()
        bow_dat_train_pca = get_exam_bow_dat(exam_train,
                                             nb_words,
                                             roi_per_img,
                                             img_list=img_list,
                                             prob_out=prob_out,
                                             clust_list=clust_list_pca)
        for i, d in enumerate(bow_dat_train_pca[1]):
            print "Shape of train BoW matrix %d:" % (i), d.shape
        sys.stdout.flush()

        print "BoW counting for test exam list"
        sys.stdout.flush()
        bow_dat_test_pca = get_exam_bow_dat(
            exam_test,
            nb_words,
            roi_per_img,
            imgen=imgen,
            clf_list=clf_list_pca,
            transformer=pca,
            target_height=img_height,
            target_scale=img_scale,
            img_per_batch=img_per_batch,
            roi_size=roi_size,
            low_int_threshold=low_int_threshold,
            blob_min_area=blob_min_area,
            blob_min_int=blob_min_int,
            blob_max_int=blob_max_int,
            blob_th_step=blob_th_step,
            seed=random_seed,
            dlrepr_model=dlrepr_model)
        for i, d in enumerate(bow_dat_test_pca[1]):
            print "Shape of test BoW matrix %d:" % (i), d.shape
        sys.stdout.flush()

    # Save K-means model and BoW count data.
    if pca is None:
        pickle.dump(clf_list, open(pca_km_states, 'w'))
        pickle.dump(bow_dat_train, open(bow_train_out, 'w'))
        pickle.dump(bow_dat_test, open(bow_test_out, 'w'))
    else:
        pickle.dump((pca, clf_list), open(pca_km_states, 'w'))
        pickle.dump((bow_dat_train, bow_dat_train_pca),
                    open(bow_train_out, 'w'))
        pickle.dump((bow_dat_test, bow_dat_test_pca), open(bow_test_out, 'w'))

    print "Done."
Пример #6
0
def run(img_folder,
        dl_state,
        clf_info_state,
        img_extension='dcm',
        img_height=4096,
        img_scale=255.,
        equalize_hist=False,
        featurewise_center=False,
        featurewise_mean=91.6,
        net='resnet50',
        batch_size=64,
        patch_size=256,
        stride=64,
        exam_tsv='./metadata/exams_metadata.tsv',
        img_tsv='./metadata/images_crosswalk.tsv',
        validation_mode=False,
        use_mean=False,
        out_pred='./output/predictions.tsv',
        progress='./progress.txt'):
    '''Run SC1 inference using prob heatmaps
    '''
    # Read some env variables.
    random_seed = int(os.getenv('RANDOM_SEED', 12345))
    rng = np.random.RandomState(random_seed)  # an rng used across board.
    gpu_count = int(os.getenv('NUM_GPU_DEVICES', 1))

    # Setup data generator for inference.
    meta_man = DMMetaManager(img_tsv=img_tsv,
                             exam_tsv=exam_tsv,
                             img_folder=img_folder,
                             img_extension=img_extension)
    if validation_mode:
        exam_list = meta_man.get_flatten_exam_list(cc_mlo_only=True)
        exam_labs = meta_man.exam_labs(exam_list)
        exam_labs = np.array(exam_labs)
        print "positive exams=%d, negative exams=%d" \
                % ((exam_labs==1).sum(), (exam_labs==0).sum())
        sys.stdout.flush()
    else:
        exam_list = meta_man.get_last_exam_list(cc_mlo_only=True)
        exam_labs = None

    # Load DL model and classifiers.
    print "Load patch classifier:", dl_state
    sys.stdout.flush()
    dl_model = load_model(dl_state)
    if gpu_count > 1:
        print "Make the model parallel on %d GPUs" % (gpu_count)
        sys.stdout.flush()
        dl_model, _ = make_parallel(dl_model, gpu_count)
        parallelized = True
    else:
        parallelized = False
    feature_name, nb_phm, cutoff_list, k, clf_list = \
            pickle.load(open(clf_info_state))

    # Load preprocess function.
    if featurewise_center:
        preprocess_input = None
    else:
        print "Load preprocess function for net:", net
        if net == 'resnet50':
            from keras.applications.resnet50 import preprocess_input
        elif net == 'vgg16':
            from keras.applications.vgg16 import preprocess_input
        elif net == 'vgg19':
            from keras.applications.vgg19 import preprocess_input
        elif net == 'xception':
            from keras.applications.xception import preprocess_input
        elif net == 'inception':
            from keras.applications.inception_v3 import preprocess_input
        else:
            raise Exception("Pretrained model is not available: " + net)

    # Print header.
    fout = open(out_pred, 'w')
    if validation_mode:
        fout.write(dminfer.INFER_HEADER_VAL)
    else:
        fout.write(dminfer.INFER_HEADER)

    print "Start inference for exam list"
    sys.stdout.flush()
    for i, e in enumerate(exam_list):
        ### DEBUG ###
        # if i >= 3:
        #    break
        ### DEBUG ###
        subj = e[0]
        exam_idx = e[1]
        if validation_mode:
            left_cancer = e[2]['L']['cancer']
            right_cancer = e[2]['R']['cancer']
            left_cancer = 0 if np.isnan(left_cancer) else left_cancer
            right_cancer = 0 if np.isnan(right_cancer) else right_cancer
        try:
            left_cc_phms = get_prob_heatmap(
                e[2]['L']['CC'],
                img_height,
                img_scale,
                patch_size,
                stride,
                dl_model,
                batch_size,
                featurewise_center=featurewise_center,
                featurewise_mean=featurewise_mean,
                preprocess=preprocess_input,
                parallelized=parallelized,
                equalize_hist=equalize_hist)
        except:
            left_cc_phms = [None]
        try:
            left_mlo_phms = get_prob_heatmap(
                e[2]['L']['MLO'],
                img_height,
                img_scale,
                patch_size,
                stride,
                dl_model,
                batch_size,
                featurewise_center=featurewise_center,
                featurewise_mean=featurewise_mean,
                preprocess=preprocess_input,
                parallelized=parallelized,
                equalize_hist=equalize_hist)
        except:
            left_mlo_phms = [None]
        try:
            right_cc_phms = get_prob_heatmap(
                e[2]['R']['CC'],
                img_height,
                img_scale,
                patch_size,
                stride,
                dl_model,
                batch_size,
                featurewise_center=featurewise_center,
                featurewise_mean=featurewise_mean,
                preprocess=preprocess_input,
                parallelized=parallelized,
                equalize_hist=equalize_hist)
        except:
            right_cc_phms = [None]
        try:
            right_mlo_phms = get_prob_heatmap(
                e[2]['R']['MLO'],
                img_height,
                img_scale,
                patch_size,
                stride,
                dl_model,
                batch_size,
                featurewise_center=featurewise_center,
                featurewise_mean=featurewise_mean,
                preprocess=preprocess_input,
                parallelized=parallelized,
                equalize_hist=equalize_hist)
        except:
            right_mlo_phms = [None]
        try:
            left_pred = dminfer.make_pred_case(left_cc_phms,
                                               left_mlo_phms,
                                               feature_name,
                                               cutoff_list,
                                               clf_list,
                                               k=k,
                                               nb_phm=nb_phm,
                                               use_mean=use_mean)
        except:
            print "Exception in predicting left breast" + \
                  " for subj:", subj, "exam:", exam_idx
            sys.stdout.flush()
            left_pred = 0.
        try:
            right_pred = dminfer.make_pred_case(right_cc_phms,
                                                right_mlo_phms,
                                                feature_name,
                                                cutoff_list,
                                                clf_list,
                                                k=k,
                                                nb_phm=nb_phm,
                                                use_mean=use_mean)
        except:
            print "Exception in predicting right breast" + \
                  " for subj:", subj, "exam:", exam_idx
            sys.stdout.flush()
            right_pred = 0.
        if validation_mode:
            fout.write("%s\t%s\tL\t%f\t%f\n" % \
                       (str(subj), str(exam_idx), left_pred, left_cancer))
            fout.write("%s\t%s\tR\t%f\t%f\n" % \
                       (str(subj), str(exam_idx), right_pred, right_cancer))
            fout.flush()
        else:
            fout.write("%s\tL\t%f\n" % (str(subj), left_pred))
            fout.write("%s\tR\t%f\n" % (str(subj), right_pred))
            fout.flush()
        print "processed %d/%d exams" % (i + 1, len(exam_list))
        sys.stdout.flush()
        with open(progress, 'w') as fpro:
            fpro.write("%f\n" % ((i + 1.) / len(exam_list)))
    print "Done."
    fout.close()