def run(img_folder, img_extension='dcm', img_size=[288, 224], img_scale=4095,
        multi_view=False, do_featurewise_norm=True,
        featurewise_mean=398.5, featurewise_std=627.8,
        batch_size=16, samples_per_epoch=160, nb_epoch=20,
        balance_classes=.0, all_neg_skip=0., pos_cls_weight=1.0,
        nb_init_filter=64, init_filter_size=7, init_conv_stride=2,
        pool_size=3, pool_stride=2,
        weight_decay=.0001, alpha=1., l1_ratio=.5,
        inp_dropout=.0, hidden_dropout=.0, init_lr=.01,
        val_size=.2, lr_patience=5, es_patience=10,
        resume_from=None, net='resnet50', load_val_ram=False,
        exam_tsv='./metadata/exams_metadata.tsv',
        img_tsv='./metadata/images_crosswalk.tsv',
        best_model='./modelState/dm_resnet_best_model.h5',
        final_model="NOSAVE"):
    '''Run ResNet training on mammograms using an exam or image list

    Args:
        featurewise_mean, featurewise_std (float): estimated from 1152 x 896
            images. Using images of different sizes gives very close results.
            For png images, use mean=7772 and std=12187.
    '''
    # Read some env variables.
    random_seed = int(os.getenv('RANDOM_SEED', 12345))
    nb_worker = int(os.getenv('NUM_CPU_CORES', 4))
    gpu_count = int(os.getenv('NUM_GPU_DEVICES', 1))

    # Setup training and validation data.
    # Load image or exam lists and split them into train and val sets.
    meta_man = DMMetaManager(exam_tsv=exam_tsv, img_tsv=img_tsv,
                             img_folder=img_folder, img_extension=img_extension)
    if multi_view:
        exam_list = meta_man.get_flatten_exam_list()
        exam_train, exam_val = train_test_split(
            exam_list, test_size=val_size, random_state=random_seed,
            stratify=meta_man.exam_labs(exam_list))
        val_size_ = len(exam_val)*2  # L and R breasts.
    else:
        img_list, lab_list = meta_man.get_flatten_img_list()
        img_train, img_val, lab_train, lab_val = train_test_split(
            img_list, lab_list, test_size=val_size, random_state=random_seed,
            stratify=lab_list)
        val_size_ = len(img_val)

    # Create image generator.
    img_gen = DMImageDataGenerator(horizontal_flip=True, vertical_flip=True)
    if do_featurewise_norm:
        img_gen.featurewise_center = True
        img_gen.featurewise_std_normalization = True
        img_gen.mean = featurewise_mean
        img_gen.std = featurewise_std
    else:
        img_gen.samplewise_center = True
        img_gen.samplewise_std_normalization = True

    if multi_view:
        train_generator = img_gen.flow_from_exam_list(
            exam_train, target_size=(img_size[0], img_size[1]),
            target_scale=img_scale, batch_size=batch_size,
            balance_classes=balance_classes, all_neg_skip=all_neg_skip,
            shuffle=True, seed=random_seed, class_mode='binary')
        if load_val_ram:
            val_generator = img_gen.flow_from_exam_list(
                exam_val, target_size=(img_size[0], img_size[1]),
                target_scale=img_scale, batch_size=val_size_,
                validation_mode=True, class_mode='binary')
        else:
            val_generator = img_gen.flow_from_exam_list(
                exam_val, target_size=(img_size[0], img_size[1]),
                target_scale=img_scale, batch_size=batch_size,
                validation_mode=True, class_mode='binary')
    else:
        train_generator = img_gen.flow_from_img_list(
            img_train, lab_train, target_size=(img_size[0], img_size[1]),
            target_scale=img_scale, batch_size=batch_size,
            balance_classes=balance_classes, all_neg_skip=all_neg_skip,
            shuffle=True, seed=random_seed, class_mode='binary')
        if load_val_ram:
            val_generator = img_gen.flow_from_img_list(
                img_val, lab_val, target_size=(img_size[0], img_size[1]),
                target_scale=img_scale, batch_size=val_size_,
                validation_mode=True, class_mode='binary')
        else:
            val_generator = img_gen.flow_from_img_list(
                img_val, lab_val, target_size=(img_size[0], img_size[1]),
                target_scale=img_scale, batch_size=batch_size,
                validation_mode=True, class_mode='binary')

    # Load validation set into RAM.
    if load_val_ram:
        validation_set = next(val_generator)
        if not multi_view:
            if len(validation_set[0]) != val_size_:
                raise Exception('Validation set size does not match.')
        elif len(validation_set[0][0]) != val_size_ \
                or len(validation_set[0][1]) != val_size_:
            raise Exception('Validation set size does not match.')

    # Create model.
    if resume_from is not None:
        model = load_model(
            resume_from,
            custom_objects={
                'sensitivity': DMMetrics.sensitivity,
                'specificity': DMMetrics.specificity
            })
    else:
        if multi_view:
            builder = MultiViewResNetBuilder
        else:
            builder = ResNetBuilder
        # Map net names to builder methods (raises KeyError for an unknown net).
        build_fn = {
            'resnet18': builder.build_resnet_18,
            'resnet34': builder.build_resnet_34,
            'resnet50': builder.build_resnet_50,
            'dmresnet14': builder.build_dm_resnet_14,
            'dmresnet47rb5': builder.build_dm_resnet_47rb5,
            'dmresnet56rb6': builder.build_dm_resnet_56rb6,
            'dmresnet65rb7': builder.build_dm_resnet_65rb7,
            'resnet101': builder.build_resnet_101,
            'resnet152': builder.build_resnet_152,
        }[net]
        model = build_fn(
            (1, img_size[0], img_size[1]), 1, nb_init_filter,
            init_filter_size, init_conv_stride, pool_size, pool_stride,
            weight_decay, alpha, l1_ratio, inp_dropout, hidden_dropout)
    if gpu_count > 1:
        model = make_parallel(model, gpu_count)

    # Model training.
    sgd = SGD(lr=init_lr, momentum=0.9, decay=0.0, nesterov=True)
    model.compile(optimizer=sgd, loss='binary_crossentropy',
                  metrics=[DMMetrics.sensitivity, DMMetrics.specificity])
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1,
                                  patience=lr_patience, verbose=1)
    early_stopping = EarlyStopping(monitor='val_loss', patience=es_patience,
                                   verbose=1)
    if load_val_ram:
        auc_checkpointer = DMAucModelCheckpoint(best_model, validation_set,
                                                batch_size=batch_size)
    else:
        auc_checkpointer = DMAucModelCheckpoint(best_model, val_generator,
                                                nb_test_samples=val_size_)
    # checkpointer = ModelCheckpoint(
    #     best_model, monitor='val_loss', verbose=1, save_best_only=True)
    hist = model.fit_generator(
        train_generator,
        samples_per_epoch=samples_per_epoch,
        nb_epoch=nb_epoch,
        class_weight={0: 1.0, 1: pos_cls_weight},
        validation_data=validation_set if load_val_ram else val_generator,
        nb_val_samples=val_size_,
        callbacks=[reduce_lr, early_stopping, auc_checkpointer],
        nb_worker=nb_worker,
        pickle_safe=True,  # turn on pickle_safe to avoid a strange error.
        verbose=2)

    # Training report.
    # Convert to an array so that element-wise comparison locates the minimum.
    val_loss = np.array(hist.history['val_loss'])
    min_loss_locs, = np.where(val_loss == val_loss.min())
    best_val_loss = hist.history['val_loss'][min_loss_locs[0]]
    best_val_sensitivity = hist.history['val_sensitivity'][min_loss_locs[0]]
    best_val_specificity = hist.history['val_specificity'][min_loss_locs[0]]
    print "\n==== Training summary ===="
    print "Minimum val loss achieved at epoch:", min_loss_locs[0] + 1
    print "Best val loss:", best_val_loss
    print "Best val sensitivity:", best_val_sensitivity
    print "Best val specificity:", best_val_specificity

    if final_model != "NOSAVE":
        model.save(final_model)

    return hist

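# The docstring above notes that featurewise_mean/featurewise_std were estimated
# from 1152 x 896 images. A minimal sketch of how such statistics could be
# estimated from a batch iterator is given below; the helper name and the use of
# a plain (X, y) generator are assumptions, not part of the original pipeline.
def estimate_featurewise_stats(img_generator, nb_batches=50):
    '''Estimate global pixel mean and std over a sample of image batches.

    Args:
        img_generator: any iterator yielding (X, y) batches of images.
        nb_batches (int): number of batches to sample for the estimate.
    '''
    samples = []
    for _ in xrange(nb_batches):
        X, _ = next(img_generator)
        samples.append(X.ravel())
    samples = np.concatenate(samples)
    return samples.mean(), samples.std()
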
def run(img_folder, dl_state, img_extension='dcm',
        img_height=1024, img_scale=4095, val_size=.2, neg_vs_pos_ratio=10.,
        do_featurewise_norm=True, featurewise_mean=873.6, featurewise_std=739.3,
        img_per_batch=2, roi_per_img=32, roi_size=(256, 256),
        low_int_threshold=.05, blob_min_area=3,
        blob_min_int=.5, blob_max_int=.85, blob_th_step=10,
        exam_tsv='./metadata/exams_metadata.tsv',
        img_tsv='./metadata/images_crosswalk.tsv',
        train_out='./modelState/meta_prob_train.pkl',
        test_out='./modelState/meta_prob_test.pkl'):
    '''Calculate ROI-based prediction scores (meta probabilities) for all exams
    '''
    # Read some env variables.
    random_seed = int(os.getenv('RANDOM_SEED', 12345))
    rng = RandomState(random_seed)  # an rng used across board.
    gpu_count = int(os.getenv('NUM_GPU_DEVICES', 1))

    # Load and split image and label lists.
    meta_man = DMMetaManager(exam_tsv=exam_tsv, img_tsv=img_tsv,
                             img_folder=img_folder, img_extension=img_extension)
    subj_list, subj_labs = meta_man.get_subj_labs()
    subj_train, subj_test, labs_train, labs_test = train_test_split(
        subj_list, subj_labs, test_size=val_size, stratify=subj_labs,
        random_state=random_seed)
    if neg_vs_pos_ratio is not None:
        def subset_subj(subj, labs):
            subj = np.array(subj)
            labs = np.array(labs)
            pos_idx = np.where(labs == 1)[0]
            neg_idx = np.where(labs == 0)[0]
            nb_neg_desired = int(len(pos_idx)*neg_vs_pos_ratio)
            if nb_neg_desired >= len(neg_idx):
                return subj.tolist()
            else:
                neg_chosen = rng.choice(neg_idx, nb_neg_desired, replace=False)
                subset_idx = np.concatenate([pos_idx, neg_chosen])
                return subj[subset_idx].tolist()

        subj_train = subset_subj(subj_train, labs_train)
        subj_test = subset_subj(subj_test, labs_test)

    # Create image generator for ROIs for representation extraction.
    print "Create an image generator for ROIs"; sys.stdout.flush()
    if do_featurewise_norm:
        imgen = DMImageDataGenerator(
            featurewise_center=True, featurewise_std_normalization=True)
        imgen.mean = featurewise_mean
        imgen.std = featurewise_std
    else:
        imgen = DMImageDataGenerator(
            samplewise_center=True, samplewise_std_normalization=True)

    # Load DL model.
    print "Load DL classification model:", dl_state; sys.stdout.flush()
    dl_model = load_model(
        dl_state,
        custom_objects={
            'sensitivity': dmm.sensitivity,
            'specificity': dmm.specificity
        })
    if gpu_count > 1:
        print "Make the model parallel on %d GPUs" % (gpu_count)
        sys.stdout.flush()
        dl_model = make_parallel(dl_model, gpu_count)

    # Read exam lists.
    exam_train = meta_man.get_flatten_exam_list(
        subj_train, flatten_img_list=True)
    exam_test = meta_man.get_flatten_exam_list(
        subj_test, flatten_img_list=True)
    exam_labs_train = np.array(meta_man.exam_labs(exam_train))
    exam_labs_test = np.array(meta_man.exam_labs(exam_test))
    nb_pos_exams_train = (exam_labs_train == 1).sum()
    nb_neg_exams_train = (exam_labs_train == 0).sum()
    nb_pos_exams_test = (exam_labs_test == 1).sum()
    nb_neg_exams_test = (exam_labs_test == 0).sum()
    print "Train set - Nb of pos exams: %d, Nb of neg exams: %d" % \
        (nb_pos_exams_train, nb_neg_exams_train)
    print "Test set - Nb of pos exams: %d, Nb of neg exams: %d" % \
        (nb_pos_exams_test, nb_neg_exams_test)

    # Make predictions for exam lists.
print "Predicting for train exam list"; sys.stdout.flush() meta_prob_train = get_exam_pred( exam_train, roi_per_img, imgen, target_height=img_height, target_scale=img_scale, img_per_batch=img_per_batch, roi_size=roi_size, low_int_threshold=low_int_threshold, blob_min_area=blob_min_area, blob_min_int=blob_min_int, blob_max_int=blob_max_int, blob_th_step=blob_th_step, seed=random_seed, dl_model=dl_model) print "Length of train prediction list:", len(meta_prob_train) sys.stdout.flush() print "Predicting for test exam list"; sys.stdout.flush() meta_prob_test = get_exam_pred( exam_test, roi_per_img, imgen, target_height=img_height, target_scale=img_scale, img_per_batch=img_per_batch, roi_size=roi_size, low_int_threshold=low_int_threshold, blob_min_area=blob_min_area, blob_min_int=blob_min_int, blob_max_int=blob_max_int, blob_th_step=blob_th_step, seed=random_seed, dl_model=dl_model) print "Length of test prediction list:", len(meta_prob_test) sys.stdout.flush() pickle.dump(meta_prob_train, open(train_out, 'w')) pickle.dump(meta_prob_test, open(test_out, 'w')) print "Done."
def run(img_folder, img_extension='png', img_size=[288, 224], multi_view=False,
        do_featurewise_norm=True, featurewise_mean=7772., featurewise_std=12187.,
        batch_size=16, samples_per_epoch=160, nb_epoch=20, val_size=.2,
        balance_classes=0., all_neg_skip=False, pos_cls_weight=1.0,
        alpha=1., l1_ratio=.5, init_lr=.01, lr_patience=2, es_patience=4,
        exam_tsv='./metadata/exams_metadata.tsv',
        img_tsv='./metadata/images_crosswalk.tsv',
        dl_state='./modelState/resnet50_288_best_model.h5',
        best_model='./modelState/enet_288_best_model.h5',
        final_model="NOSAVE"):
    '''Train an elastic-net classifier on DL representations of mammograms
    '''
    # Read some env variables.
    random_seed = int(os.getenv('RANDOM_SEED', 12345))
    nb_worker = int(os.getenv('NUM_CPU_CORES', 4))
    gpu_count = int(os.getenv('NUM_GPU_DEVICES', 1))

    # Setup training and validation data.
    meta_man = DMMetaManager(exam_tsv=exam_tsv, img_tsv=img_tsv,
                             img_folder=img_folder, img_extension=img_extension)
    if multi_view:
        exam_list = meta_man.get_flatten_exam_list()
        exam_train, exam_val = train_test_split(
            exam_list, test_size=val_size, random_state=random_seed,
            stratify=meta_man.exam_labs(exam_list))
        val_size_ = len(exam_val)*2  # L and R breasts.
    else:
        img_list, lab_list = meta_man.get_flatten_img_list()
        img_train, img_val, lab_train, lab_val = train_test_split(
            img_list, lab_list, test_size=val_size, random_state=random_seed,
            stratify=lab_list)
        val_size_ = len(img_val)

    img_gen = DMImageDataGenerator(horizontal_flip=True, vertical_flip=True)
    if do_featurewise_norm:
        img_gen.featurewise_center = True
        img_gen.featurewise_std_normalization = True
        img_gen.mean = featurewise_mean
        img_gen.std = featurewise_std
    else:
        img_gen.samplewise_center = True
        img_gen.samplewise_std_normalization = True

    if multi_view:
        train_generator = img_gen.flow_from_exam_list(
            exam_train, target_size=(img_size[0], img_size[1]),
            batch_size=batch_size, balance_classes=balance_classes,
            all_neg_skip=all_neg_skip, shuffle=True, seed=random_seed,
            class_mode='binary')
        val_generator = img_gen.flow_from_exam_list(
            exam_val, target_size=(img_size[0], img_size[1]),
            batch_size=batch_size, validation_mode=True, class_mode='binary')
    else:
        train_generator = img_gen.flow_from_img_list(
            img_train, lab_train, target_size=(img_size[0], img_size[1]),
            batch_size=batch_size, balance_classes=balance_classes,
            all_neg_skip=all_neg_skip, shuffle=True, seed=random_seed,
            class_mode='binary')
        val_generator = img_gen.flow_from_img_list(
            img_val, lab_val, target_size=(img_size[0], img_size[1]),
            batch_size=batch_size, validation_mode=True, class_mode='binary')

    # Deep learning model.
    dl_model = load_model(
        dl_state,
        custom_objects={
            'sensitivity': DMMetrics.sensitivity,
            'specificity': DMMetrics.specificity
        })
    # Dummy compilation to turn off the "uncompiled" error when the model is
    # run on multiple GPUs.
    # dl_model.compile(optimizer='sgd', loss='binary_crossentropy')
    reprlayer_model = Model(
        input=dl_model.input, output=dl_model.get_layer(index=-2).output)
    if gpu_count > 1:
        reprlayer_model = make_parallel(reprlayer_model, gpu_count)

    # Setup test data in RAM.
    X_test, y_test = dlrepr_generator(
        reprlayer_model, val_generator, val_size_)
    # import pdb; pdb.set_trace()

    # Evaluate the DL model on the test data.
    val_generator.reset()
    dl_test_pred = dl_model.predict_generator(
        val_generator, val_samples=val_size_, nb_worker=1, pickle_safe=False)
    # Setting nb_worker to >1 can cause:
    # either inconsistent results when pickle_safe is False,
    # or a broadcasting error when pickle_safe is True.
    # This seems to be a Keras bug!!
    # Further note: the broadcasting error may only happen when val_size_
    # is not divisible by batch_size.
    try:
        dl_auc = roc_auc_score(y_test, dl_test_pred)
        dl_loss = log_loss(y_test, dl_test_pred)
    except ValueError:
        dl_auc = 0.
        dl_loss = np.inf
    print "\nAUROC by the DL model: %.4f, loss: %.4f" % (dl_auc, dl_loss)
    # import pdb; pdb.set_trace()

    # Elastic net training.
    target_classes = np.array([0, 1])
    sgd_clf = SGDClassifier(
        loss='log', penalty='elasticnet', alpha=alpha, l1_ratio=l1_ratio,
        verbose=0, n_jobs=nb_worker, learning_rate='constant', eta0=init_lr,
        random_state=random_seed,
        class_weight={0: 1.0, 1: pos_cls_weight})
    curr_lr = init_lr
    best_epoch = 0
    best_auc = 0.
    min_loss = np.inf
    min_loss_epoch = 0
    for epoch in xrange(nb_epoch):
        samples_seen = 0
        X_list = []
        y_list = []
        epoch_start = time.time()
        while samples_seen < samples_per_epoch:
            X, y = next(train_generator)
            X_repr = reprlayer_model.predict_on_batch(X)
            sgd_clf.partial_fit(X_repr, y, classes=target_classes)
            samples_seen += len(y)
            X_list.append(X_repr)
            y_list.append(y)
        # The training X, y are expected to change for each epoch due to
        # image random sampling and class balancing.
        X_train_epo = np.concatenate(X_list)
        y_train_epo = np.concatenate(y_list)
        # End of epoch summary.
        pred_prob = sgd_clf.predict_proba(X_test)[:, 1]
        train_prob = sgd_clf.predict_proba(X_train_epo)[:, 1]
        try:
            auc = roc_auc_score(y_test, pred_prob)
            crossentropy_loss = log_loss(y_test, pred_prob)
        except ValueError:
            auc = 0.
            crossentropy_loss = np.inf
        try:
            train_loss = log_loss(y_train_epo, train_prob)
        except ValueError:
            train_loss = np.inf
        wei_sparseness = np.mean(sgd_clf.coef_ == 0)
        epoch_span = time.time() - epoch_start
        print ("%ds - Epoch=%d, auc=%.4f, train_loss=%.4f, test_loss=%.4f, "
               "weight sparsity=%.4f") % \
            (epoch_span, epoch + 1, auc, train_loss, crossentropy_loss,
             wei_sparseness)
        # Model checkpoint, learning rate reduction and early stopping.
        if auc > best_auc:
            best_epoch = epoch + 1
            best_auc = auc
            if best_model != "NOSAVE":
                with open(best_model, 'w') as best_state:
                    pickle.dump(sgd_clf, best_state)
        if crossentropy_loss < min_loss:
            min_loss = crossentropy_loss
            min_loss_epoch = epoch + 1
        else:
            if epoch + 1 - min_loss_epoch >= es_patience:
                print 'Early stopping criterion has been reached. Stop training.'
                break
            if epoch + 1 - min_loss_epoch >= lr_patience:
                curr_lr *= .1
                sgd_clf.set_params(eta0=curr_lr)
                print "Reducing learning rate to: %s" % (curr_lr)

    # End of training summary.
    print ">>> Found best AUROC: %.4f at epoch: %d, saved to: %s <<<" % \
        (best_auc, best_epoch, best_model)
    print ">>> Found best val loss: %.4f at epoch: %d. <<<" % \
        (min_loss, min_loss_epoch)
    #### Save elastic net model!! ####
    if final_model != "NOSAVE":
        with open(final_model, 'w') as final_state:
            pickle.dump(sgd_clf, final_state)

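# The training loop above relies on SGDClassifier's partial_fit together with a
# constant learning rate that is reduced manually via set_params(eta0=...).
# A self-contained sketch of that pattern on synthetic data follows; the data
# and the decay schedule are illustrative only (loss='log' is the logistic loss
# in the scikit-learn versions this code targets).
def sgd_partial_fit_demo(nb_epoch=5, eta0=.01):
    rng_ = np.random.RandomState(0)
    X = rng_.randn(200, 16)
    y = (X[:, 0] + .5*rng_.randn(200) > 0).astype(int)
    clf = SGDClassifier(loss='log', penalty='elasticnet', alpha=.01,
                        l1_ratio=.5, learning_rate='constant', eta0=eta0,
                        random_state=0)
    for epoch in xrange(nb_epoch):
        clf.partial_fit(X, y, classes=np.array([0, 1]))
        # Reduce the learning rate after each epoch (illustrative schedule).
        eta0 *= .5
        clf.set_params(eta0=eta0)
        print "epoch=%d, train log loss=%.4f" % \
            (epoch + 1, log_loss(y, clf.predict_proba(X)[:, 1]))
    return clf
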
def run(img_folder, dl_state, img_extension='dcm',
        img_height=1024, img_scale=255., equalize_hist=False,
        featurewise_center=False, featurewise_mean=91.6,
        neg_vs_pos_ratio=1., net='vgg19', batch_size=128,
        patch_size=256, stride=8,
        exam_tsv='./metadata/exams_metadata.tsv',
        img_tsv='./metadata/images_crosswalk.tsv',
        out='./modelState/prob_heatmap.pkl',
        predicted_subj_file=None, add_subjs=500):
    '''Sweep mammograms with a trained DL model to create prob heatmaps
    '''
    # Read some env variables.
    random_seed = int(os.getenv('RANDOM_SEED', 12345))
    rng = RandomState(random_seed)  # an rng used across board.
    gpu_count = int(os.getenv('NUM_GPU_DEVICES', 1))

    # Load and split image and label lists.
    meta_man = DMMetaManager(exam_tsv=exam_tsv, img_tsv=img_tsv,
                             img_folder=img_folder, img_extension=img_extension)
    subj_list, subj_labs = meta_man.get_subj_labs()
    subj_labs = np.array(subj_labs)
    print "Found %d subjects" % (len(subj_list))
    print "cancer patients=%d, normal patients=%d" \
        % ((subj_labs == 1).sum(), (subj_labs == 0).sum())
    if predicted_subj_file is not None:
        predicted_subjs = np.load(predicted_subj_file)
        subj_list = np.setdiff1d(subj_list, predicted_subjs)
        subj_list = subj_list[:add_subjs]
        print "Will predict additional %d subjects" % (len(subj_list))
    elif neg_vs_pos_ratio is not None:
        subj_list, subj_labs = DMMetaManager.subset_subj_list(
            subj_list, subj_labs, neg_vs_pos_ratio, random_seed)
        subj_labs = np.array(subj_labs)
        print "After subsetting, there are %d subjects" % (len(subj_list))
        print "cancer patients=%d, normal patients=%d" \
            % ((subj_labs == 1).sum(), (subj_labs == 0).sum())

    # Get exam lists.
    # >>>> Debug <<<< #
    # subj_list = subj_list[:2]
    # >>>> Debug <<<< #
    print "Get flattened exam list"
    exam_list = meta_man.get_flatten_exam_list(subj_list, cc_mlo_only=True)
    exam_labs = meta_man.exam_labs(exam_list)
    exam_labs = np.array(exam_labs)
    print "positive exams=%d, negative exams=%d" \
        % ((exam_labs == 1).sum(), (exam_labs == 0).sum())
    sys.stdout.flush()

    # Load DL model.
    print "Load patch classifier:", dl_state
    sys.stdout.flush()
    dl_model = load_model(
        dl_state,
        custom_objects={
            'sensitivity': dmm.sensitivity,
            'specificity': dmm.specificity
        })
    if gpu_count > 1:
        print "Make the model parallel on %d GPUs" % (gpu_count)
        sys.stdout.flush()
        dl_model, _ = make_parallel(dl_model, gpu_count)
        parallelized = True
    else:
        parallelized = False

    # Load preprocess function.
    if featurewise_center:
        preprocess_input = None
    else:
        print "Load preprocess function for net:", net
        if net == 'resnet50':
            from keras.applications.resnet50 import preprocess_input
        elif net == 'vgg16':
            from keras.applications.vgg16 import preprocess_input
        elif net == 'vgg19':
            from keras.applications.vgg19 import preprocess_input
        elif net == 'xception':
            from keras.applications.xception import preprocess_input
        elif net == 'inception':
            from keras.applications.inception_v3 import preprocess_input
        else:
            raise Exception("Pretrained model is not available: " + net)

    # Sweep the whole images and classify patches.
print "Generate prob heatmaps for exam list" sys.stdout.flush() heatmap_dat_list = [] for i, e in enumerate(exam_list): dat = (e[0], e[1], { 'L': { 'cancer': e[2]['L']['cancer'] }, 'R': { 'cancer': e[2]['R']['cancer'] } }) dat[2]['L']['CC'] = get_prob_heatmap( e[2]['L']['CC'], img_height, img_scale, patch_size, stride, dl_model, batch_size, featurewise_center=featurewise_center, featurewise_mean=featurewise_mean, preprocess=preprocess_input, parallelized=parallelized, equalize_hist=equalize_hist) dat[2]['L']['MLO'] = get_prob_heatmap( e[2]['L']['MLO'], img_height, img_scale, patch_size, stride, dl_model, batch_size, featurewise_center=featurewise_center, featurewise_mean=featurewise_mean, preprocess=preprocess_input, parallelized=parallelized, equalize_hist=equalize_hist) dat[2]['R']['CC'] = get_prob_heatmap( e[2]['R']['CC'], img_height, img_scale, patch_size, stride, dl_model, batch_size, featurewise_center=featurewise_center, featurewise_mean=featurewise_mean, preprocess=preprocess_input, parallelized=parallelized, equalize_hist=equalize_hist) dat[2]['R']['MLO'] = get_prob_heatmap( e[2]['R']['MLO'], img_height, img_scale, patch_size, stride, dl_model, batch_size, featurewise_center=featurewise_center, featurewise_mean=featurewise_mean, preprocess=preprocess_input, parallelized=parallelized, equalize_hist=equalize_hist) heatmap_dat_list.append(dat) print "processed %d/%d exams" % (i + 1, len(exam_list)) sys.stdout.flush() ### DEBUG ### # if i >= 1: # break ### DEBUG ### print "Done." # Save the result. print "Saving result to external files.", sys.stdout.flush() pickle.dump(heatmap_dat_list, open(out, 'w')) print "Done."
def run(img_folder, dl_state, img_extension='dcm',
        img_height=1024, img_scale=4095, val_size=.2, neg_vs_pos_ratio=10.,
        do_featurewise_norm=True, featurewise_mean=873.6, featurewise_std=739.3,
        img_per_batch=2, roi_per_img=32, roi_size=(256, 256),
        low_int_threshold=.05, blob_min_area=3,
        blob_min_int=.5, blob_max_int=.85, blob_th_step=10,
        layer_name=['flatten_1', 'dense_1'], layer_index=None,
        roi_state=None, roi_clf_bs=32,
        pc_components=.95, pc_whiten=True,
        nb_words=[512], km_max_iter=100, km_bs=1000,
        km_patience=20, km_init=10,
        exam_tsv='./metadata/exams_metadata.tsv',
        img_tsv='./metadata/images_crosswalk.tsv',
        pca_km_states='./modelState/dlrepr_pca_km_models.pkl',
        bow_train_out='./modelState/bow_dat_train.pkl',
        bow_test_out='./modelState/bow_dat_test.pkl'):
    '''Calculate bag of deep visual words count matrices for all breasts
    '''
    # Read some env variables.
    random_seed = int(os.getenv('RANDOM_SEED', 12345))
    rng = RandomState(random_seed)  # an rng used across board.

    # Load and split image and label lists.
    meta_man = DMMetaManager(exam_tsv=exam_tsv, img_tsv=img_tsv,
                             img_folder=img_folder, img_extension=img_extension)
    subj_list, subj_labs = meta_man.get_subj_labs()
    subj_train, subj_test, labs_train, labs_test = train_test_split(
        subj_list, subj_labs, test_size=val_size, stratify=subj_labs,
        random_state=random_seed)
    if neg_vs_pos_ratio is not None:
        def subset_subj(subj, labs):
            subj = np.array(subj)
            labs = np.array(labs)
            pos_idx = np.where(labs == 1)[0]
            neg_idx = np.where(labs == 0)[0]
            nb_neg_desired = int(len(pos_idx)*neg_vs_pos_ratio)
            if nb_neg_desired >= len(neg_idx):
                return subj.tolist()
            else:
                neg_chosen = rng.choice(neg_idx, nb_neg_desired, replace=False)
                subset_idx = np.concatenate([pos_idx, neg_chosen])
                return subj[subset_idx].tolist()

        subj_train = subset_subj(subj_train, labs_train)
        subj_test = subset_subj(subj_test, labs_test)

    img_list, lab_list = meta_man.get_flatten_img_list(subj_train)
    lab_list = np.array(lab_list)
    print "Train set - Nb of positive images: %d, Nb of negative images: %d" \
        % ((lab_list == 1).sum(), (lab_list == 0).sum())
    sys.stdout.flush()

    # Create image generator for ROIs for representation extraction.
    print "Create an image generator for ROIs"
    sys.stdout.flush()
    if do_featurewise_norm:
        imgen = DMImageDataGenerator(
            featurewise_center=True, featurewise_std_normalization=True)
        imgen.mean = featurewise_mean
        imgen.std = featurewise_std
    else:
        imgen = DMImageDataGenerator(
            samplewise_center=True, samplewise_std_normalization=True)

    # Load ROI classifier.
    if roi_state is not None:
        print "Load ROI classifier"
        sys.stdout.flush()
        roi_clf = load_model(
            roi_state,
            custom_objects={
                'sensitivity': dmm.sensitivity,
                'specificity': dmm.specificity
            })
        graph = tf.get_default_graph()
    else:
        roi_clf = None
        graph = None

    # Create ROI generators for pos and neg images separately.
    print "Create ROI generators for pos and neg images"
    sys.stdout.flush()
    roi_generator = imgen.flow_from_candid_roi(
        img_list, target_height=img_height, target_scale=img_scale,
        class_mode=None, validation_mode=True,
        img_per_batch=img_per_batch, roi_per_img=roi_per_img,
        roi_size=roi_size, low_int_threshold=low_int_threshold,
        blob_min_area=blob_min_area, blob_min_int=blob_min_int,
        blob_max_int=blob_max_int, blob_th_step=blob_th_step,
        tf_graph=graph, roi_clf=roi_clf, clf_bs=roi_clf_bs,
        return_sample_weight=False, seed=random_seed)

    # Generate image patches and extract their DL representations.
print "Load DL representation model" sys.stdout.flush() dlrepr_model = DLRepr(dl_state, custom_objects={ 'sensitivity': dmm.sensitivity, 'specificity': dmm.specificity }, layer_name=layer_name, layer_index=layer_index) last_output_size = dlrepr_model.get_output_shape()[-1][-1] if last_output_size != 3 and last_output_size != 1: raise Exception("The last output must be prob outputs (size=3 or 1)") nb_tot_samples = len(img_list) * roi_per_img print "Extract ROIs from pos and neg images" sys.stdout.flush() pred = dlrepr_model.predict_generator(roi_generator, val_samples=nb_tot_samples) for i, d in enumerate(pred): print "Shape of representation/output data %d:" % (i), d.shape sys.stdout.flush() # Flatten feature maps, e.g. an 8x8 feature map will become a 64-d vector. pred = [d.reshape((-1, d.shape[-1])) for d in pred] for i, d in enumerate(pred): print "Shape of flattened data %d:" % (i), d.shape sys.stdout.flush() # Split representations and prob outputs. dl_repr = pred[0] prob_out = pred[1] if prob_out.shape[1] == 3: prob_out = prob_out[:, 1] # pos class. prob_out = prob_out.reshape((len(img_list), -1)) print "Reshape prob output to:", prob_out.shape sys.stdout.flush() # Use PCA to reduce dimension of the representation data. if pc_components is not None: print "Start PCA dimension reduction on DL representation" sys.stdout.flush() pca = PCA(n_components=pc_components, whiten=pc_whiten) pca.fit(dl_repr) print "Nb of PCA components:", pca.n_components_ print "Total explained variance ratio: %.4f" % \ (pca.explained_variance_ratio_.sum()) dl_repr_pca = pca.transform(dl_repr) print "Shape of transformed representation data:", dl_repr_pca.shape sys.stdout.flush() else: pca = None # Use K-means to create a codebook for deep visual words. print "Start K-means training on DL representation" sys.stdout.flush() clf_list = [] clust_list = [] # Shuffling indices for mini-batches learning. perm_idx = rng.permutation(len(dl_repr)) for n in nb_words: print "Train K-means with %d cluster centers" % (n) sys.stdout.flush() clf = MiniBatchKMeans(n_clusters=n, init='k-means++', max_iter=km_max_iter, batch_size=km_bs, compute_labels=True, random_state=random_seed, tol=0.0, max_no_improvement=km_patience, init_size=None, n_init=km_init, reassignment_ratio=0.01, verbose=0) clf.fit(dl_repr[perm_idx]) clf_list.append(clf) clust = np.zeros_like(clf.labels_) clust[perm_idx] = clf.labels_ clust = clust.reshape((len(img_list), -1)) clust_list.append(clust) if pca is not None: print "Start K-means training on transformed representation" sys.stdout.flush() clf_list_pca = [] clust_list_pca = [] # Shuffling indices for mini-batches learning. perm_idx = rng.permutation(len(dl_repr_pca)) for n in nb_words: print "Train K-means with %d cluster centers" % (n) sys.stdout.flush() clf = MiniBatchKMeans(n_clusters=n, init='k-means++', max_iter=km_max_iter, batch_size=km_bs, compute_labels=True, random_state=random_seed, tol=0.0, max_no_improvement=km_patience, init_size=None, n_init=km_init, reassignment_ratio=0.01, verbose=0) clf.fit(dl_repr_pca[perm_idx]) clf_list_pca.append(clf) clust = np.zeros_like(clf.labels_) clust[perm_idx] = clf.labels_ clust = clust.reshape((len(img_list), -1)) clust_list_pca.append(clust) # Read exam lists. 
    exam_train = meta_man.get_flatten_exam_list(
        subj_train, flatten_img_list=True)
    exam_test = meta_man.get_flatten_exam_list(
        subj_test, flatten_img_list=True)
    exam_labs_train = np.array(meta_man.exam_labs(exam_train))
    exam_labs_test = np.array(meta_man.exam_labs(exam_test))
    nb_pos_exams_train = (exam_labs_train == 1).sum()
    nb_neg_exams_train = (exam_labs_train == 0).sum()
    nb_pos_exams_test = (exam_labs_test == 1).sum()
    nb_neg_exams_test = (exam_labs_test == 0).sum()
    print "Train set - Nb of pos exams: %d, Nb of neg exams: %d" % \
        (nb_pos_exams_train, nb_neg_exams_train)
    print "Test set - Nb of pos exams: %d, Nb of neg exams: %d" % \
        (nb_pos_exams_test, nb_neg_exams_test)

    # Do BoW counts for each breast.
    print "BoW counting for train exam list"
    sys.stdout.flush()
    bow_dat_train = get_exam_bow_dat(
        exam_train, nb_words, roi_per_img,
        img_list=img_list, prob_out=prob_out, clust_list=clust_list)
    for i, d in enumerate(bow_dat_train[1]):
        print "Shape of train BoW matrix %d:" % (i), d.shape
    sys.stdout.flush()

    print "BoW counting for test exam list"
    sys.stdout.flush()
    bow_dat_test = get_exam_bow_dat(
        exam_test, nb_words, roi_per_img,
        imgen=imgen, clf_list=clf_list, transformer=None,
        target_height=img_height, target_scale=img_scale,
        img_per_batch=img_per_batch, roi_size=roi_size,
        low_int_threshold=low_int_threshold, blob_min_area=blob_min_area,
        blob_min_int=blob_min_int, blob_max_int=blob_max_int,
        blob_th_step=blob_th_step, seed=random_seed,
        dlrepr_model=dlrepr_model)
    for i, d in enumerate(bow_dat_test[1]):
        print "Shape of test BoW matrix %d:" % (i), d.shape
    sys.stdout.flush()

    if pca is not None:
        print "== Do same BoW counting on PCA transformed data =="
        print "BoW counting for train exam list"
        sys.stdout.flush()
        bow_dat_train_pca = get_exam_bow_dat(
            exam_train, nb_words, roi_per_img,
            img_list=img_list, prob_out=prob_out, clust_list=clust_list_pca)
        for i, d in enumerate(bow_dat_train_pca[1]):
            print "Shape of train BoW matrix %d:" % (i), d.shape
        sys.stdout.flush()

        print "BoW counting for test exam list"
        sys.stdout.flush()
        bow_dat_test_pca = get_exam_bow_dat(
            exam_test, nb_words, roi_per_img,
            imgen=imgen, clf_list=clf_list_pca, transformer=pca,
            target_height=img_height, target_scale=img_scale,
            img_per_batch=img_per_batch, roi_size=roi_size,
            low_int_threshold=low_int_threshold, blob_min_area=blob_min_area,
            blob_min_int=blob_min_int, blob_max_int=blob_max_int,
            blob_th_step=blob_th_step, seed=random_seed,
            dlrepr_model=dlrepr_model)
        for i, d in enumerate(bow_dat_test_pca[1]):
            print "Shape of test BoW matrix %d:" % (i), d.shape
        sys.stdout.flush()

    # Save K-means models and BoW count data.
    if pca is None:
        pickle.dump(clf_list, open(pca_km_states, 'w'))
        pickle.dump(bow_dat_train, open(bow_train_out, 'w'))
        pickle.dump(bow_dat_test, open(bow_test_out, 'w'))
    else:
        pickle.dump((pca, clf_list), open(pca_km_states, 'w'))
        pickle.dump((bow_dat_train, bow_dat_train_pca), open(bow_train_out, 'w'))
        pickle.dump((bow_dat_test, bow_dat_test_pca), open(bow_test_out, 'w'))
    print "Done."

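# get_exam_bow_dat builds the bag-of-visual-words count matrices. The core
# counting step can be illustrated with np.bincount: each image contributes a
# histogram over the K-means codebook of its ROI cluster assignments. The
# helper below is a hypothetical sketch, not the repo's implementation.
def bow_count_demo(clust, nb_word):
    '''clust: (nb_img, roi_per_img) array of cluster labels for each ROI.'''
    return np.array([np.bincount(row, minlength=nb_word) for row in clust])
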
def run(img_folder, dl_state, clf_info_state, img_extension='dcm',
        img_height=4096, img_scale=255., equalize_hist=False,
        featurewise_center=False, featurewise_mean=91.6,
        net='resnet50', batch_size=64, patch_size=256, stride=64,
        exam_tsv='./metadata/exams_metadata.tsv',
        img_tsv='./metadata/images_crosswalk.tsv',
        validation_mode=False, use_mean=False,
        out_pred='./output/predictions.tsv',
        progress='./progress.txt'):
    '''Run SC1 inference using prob heatmaps
    '''
    # Read some env variables.
    random_seed = int(os.getenv('RANDOM_SEED', 12345))
    rng = np.random.RandomState(random_seed)  # an rng used across board.
    gpu_count = int(os.getenv('NUM_GPU_DEVICES', 1))

    # Setup data generator for inference.
    meta_man = DMMetaManager(
        img_tsv=img_tsv, exam_tsv=exam_tsv, img_folder=img_folder,
        img_extension=img_extension)
    if validation_mode:
        exam_list = meta_man.get_flatten_exam_list(cc_mlo_only=True)
        exam_labs = meta_man.exam_labs(exam_list)
        exam_labs = np.array(exam_labs)
        print "positive exams=%d, negative exams=%d" \
            % ((exam_labs == 1).sum(), (exam_labs == 0).sum())
        sys.stdout.flush()
    else:
        exam_list = meta_man.get_last_exam_list(cc_mlo_only=True)
        exam_labs = None

    # Load DL model and classifiers.
    print "Load patch classifier:", dl_state
    sys.stdout.flush()
    dl_model = load_model(dl_state)
    if gpu_count > 1:
        print "Make the model parallel on %d GPUs" % (gpu_count)
        sys.stdout.flush()
        dl_model, _ = make_parallel(dl_model, gpu_count)
        parallelized = True
    else:
        parallelized = False
    feature_name, nb_phm, cutoff_list, k, clf_list = \
        pickle.load(open(clf_info_state))

    # Load preprocess function.
    if featurewise_center:
        preprocess_input = None
    else:
        print "Load preprocess function for net:", net
        if net == 'resnet50':
            from keras.applications.resnet50 import preprocess_input
        elif net == 'vgg16':
            from keras.applications.vgg16 import preprocess_input
        elif net == 'vgg19':
            from keras.applications.vgg19 import preprocess_input
        elif net == 'xception':
            from keras.applications.xception import preprocess_input
        elif net == 'inception':
            from keras.applications.inception_v3 import preprocess_input
        else:
            raise Exception("Pretrained model is not available: " + net)

    # Print header.
    fout = open(out_pred, 'w')
    if validation_mode:
        fout.write(dminfer.INFER_HEADER_VAL)
    else:
        fout.write(dminfer.INFER_HEADER)

    print "Start inference for exam list"
    sys.stdout.flush()
    for i, e in enumerate(exam_list):
        ### DEBUG ###
        # if i >= 3:
        #     break
        ### DEBUG ###
        subj = e[0]
        exam_idx = e[1]
        if validation_mode:
            left_cancer = e[2]['L']['cancer']
            right_cancer = e[2]['R']['cancer']
            left_cancer = 0 if np.isnan(left_cancer) else left_cancer
            right_cancer = 0 if np.isnan(right_cancer) else right_cancer
        try:
            left_cc_phms = get_prob_heatmap(
                e[2]['L']['CC'], img_height, img_scale, patch_size, stride,
                dl_model, batch_size, featurewise_center=featurewise_center,
                featurewise_mean=featurewise_mean, preprocess=preprocess_input,
                parallelized=parallelized, equalize_hist=equalize_hist)
        except:
            left_cc_phms = [None]
        try:
            left_mlo_phms = get_prob_heatmap(
                e[2]['L']['MLO'], img_height, img_scale, patch_size, stride,
                dl_model, batch_size, featurewise_center=featurewise_center,
                featurewise_mean=featurewise_mean, preprocess=preprocess_input,
                parallelized=parallelized, equalize_hist=equalize_hist)
        except:
            left_mlo_phms = [None]
        try:
            right_cc_phms = get_prob_heatmap(
                e[2]['R']['CC'], img_height, img_scale, patch_size, stride,
                dl_model, batch_size, featurewise_center=featurewise_center,
                featurewise_mean=featurewise_mean, preprocess=preprocess_input,
                parallelized=parallelized, equalize_hist=equalize_hist)
        except:
            right_cc_phms = [None]
        try:
            right_mlo_phms = get_prob_heatmap(
                e[2]['R']['MLO'], img_height, img_scale, patch_size, stride,
                dl_model, batch_size, featurewise_center=featurewise_center,
                featurewise_mean=featurewise_mean, preprocess=preprocess_input,
                parallelized=parallelized, equalize_hist=equalize_hist)
        except:
            right_mlo_phms = [None]
        try:
            left_pred = dminfer.make_pred_case(
                left_cc_phms, left_mlo_phms, feature_name, cutoff_list,
                clf_list, k=k, nb_phm=nb_phm, use_mean=use_mean)
        except:
            print "Exception in predicting left breast" + \
                " for subj:", subj, "exam:", exam_idx
            sys.stdout.flush()
            left_pred = 0.
        try:
            right_pred = dminfer.make_pred_case(
                right_cc_phms, right_mlo_phms, feature_name, cutoff_list,
                clf_list, k=k, nb_phm=nb_phm, use_mean=use_mean)
        except:
            print "Exception in predicting right breast" + \
                " for subj:", subj, "exam:", exam_idx
            sys.stdout.flush()
            right_pred = 0.
        if validation_mode:
            fout.write("%s\t%s\tL\t%f\t%f\n" % \
                       (str(subj), str(exam_idx), left_pred, left_cancer))
            fout.write("%s\t%s\tR\t%f\t%f\n" % \
                       (str(subj), str(exam_idx), right_pred, right_cancer))
            fout.flush()
        else:
            fout.write("%s\tL\t%f\n" % (str(subj), left_pred))
            fout.write("%s\tR\t%f\n" % (str(subj), right_pred))
            fout.flush()
        print "processed %d/%d exams" % (i + 1, len(exam_list))
        sys.stdout.flush()
        with open(progress, 'w') as fpro:
            fpro.write("%f\n" % ((i + 1.) / len(exam_list)))
    print "Done."
    fout.close()

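# In validation mode the TSV written above has one row per breast with the
# predicted confidence and the cancer label in the last two columns (the exact
# header comes from dminfer.INFER_HEADER_VAL). A minimal sketch for scoring such
# a file with AUROC follows; the use of pandas here is an assumption.
def score_val_predictions(pred_tsv='./output/predictions.tsv'):
    import pandas as pd
    from sklearn.metrics import roc_auc_score
    df = pd.read_csv(pred_tsv, sep='\t')
    conf, cancer = df.iloc[:, -2], df.iloc[:, -1]
    return roc_auc_score(cancer, conf)
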