def test_reinhard_stats(self):
    """Compare ``htk_cn.reinhard_stats`` on the sample SVS slide to stored values.

    The slide is fetched through ``utilities.externaldata``, NumPy's global
    random generator is seeded, a Dask client is created, and the returned
    mean / standard deviation are checked against reference values with an
    absolute tolerance of 1e-2.
    """
    sample_image = utilities.externaldata(
        'data/sample_svs_image.TCGA-DU-6399-01A-01-TS1.e8eb65de-d63e-42db-af6f-14fefbbdf7bd.svs.sha512'  # noqa
    )
    wsi_path = os.path.join(sample_image)

    # Seed the global generator before the stats are computed.
    np.random.seed(1)

    # create dask client
    # In Python 3 unittesting, the scheduler fails if it uses processes,
    # hence the 'multithreading' scheduler (instead of None).
    Parameters = collections.namedtuple(
        'Parameters',
        ['scheduler', 'num_workers', 'num_threads_per_worker'])
    dask_args = Parameters(
        scheduler='multithreading',
        num_workers=-1,
        num_threads_per_worker=1,
    )
    cli_utils.create_dask_client(dask_args)

    # Reference values the computed statistics are compared against.
    expected_mean = [8.896134, -0.074579, 0.022006]
    expected_stddev = [0.612143, 0.122667, 0.021361]

    # compute reinhard stats
    # (0.1 is the second positional argument -- presumably the sampling
    # fraction; confirm against htk_cn.reinhard_stats.)
    wsi_mean, wsi_stddev = htk_cn.reinhard_stats(
        wsi_path, 0.1, magnification=20)

    np.testing.assert_allclose(wsi_mean, expected_mean, atol=1e-2)
    np.testing.assert_allclose(wsi_stddev, expected_stddev, atol=1e-2)
def test_background_intensity(self):
    """Compare ``htk_cn.background_intensity`` on the sample SVS slide to stored values.

    The slide is fetched through ``utilities.externaldata``, NumPy's global
    random generator is seeded, a Dask client is created, and the returned
    intensity is expected to match ``[242, 244, 241]`` within an absolute
    tolerance of 1.
    """
    sample_image = utilities.externaldata(
        'data/sample_svs_image.TCGA-DU-6399-01A-01-TS1.e8eb65de-d63e-42db-af6f-14fefbbdf7bd.svs.sha512'  # noqa
    )
    wsi_path = os.path.join(sample_image)

    # Seed the global generator before the intensity is computed.
    np.random.seed(1)

    # create dask client
    # In Python 3 unittesting, the scheduler fails if it uses processes,
    # hence the 'multithreading' scheduler (instead of None).
    Parameters = collections.namedtuple(
        'Parameters',
        ['scheduler', 'num_workers', 'num_threads_per_worker'])
    dask_args = Parameters(
        scheduler='multithreading',
        num_workers=-1,
        num_threads_per_worker=1,
    )
    cli_utils.create_dask_client(dask_args)

    expected_intensity = [242, 244, 241]

    # compute background intensity
    I_0 = htk_cn.background_intensity(
        wsi_path, sample_approximate_total=5000)

    np.testing.assert_allclose(I_0, expected_intensity, atol=1)
def main(args):
    """Estimate a stain matrix from sampled pixels and write it out.

    The arguments are split into groups with ``utils.splitArgs``; the
    ``dask``, ``sample``, ``stains`` and ``snmf`` groups are used here.
    Pixels are sampled, an initial stain matrix is built from the ``stains``
    group, ``htk_cdeconv.rgb_separate_stains_xu_snmf`` refines it and
    ``htk_cdeconv.complement_stain_matrix`` completes it.  One
    ``stainColor_<n> = <comma separated values>`` line per row of the
    transposed result is written to ``args.returnParameterFile``.
    """
    args = utils.splitArgs(args)
    snmf_args = args.snmf
    snmf_args.I_0 = numpy.array(snmf_args.I_0)

    print(">> Starting Dask cluster and sampling pixels")
    utils.create_dask_client(args.dask)
    pixel_sample = utils.sample_pixels(args.sample)

    # Create stain matrix
    print('>> Creating stain matrix')
    snmf_args.w_init = utils.get_stain_matrix(args.stains, 2)
    print(snmf_args.w_init)

    # Perform color deconvolution
    print('>> Performing color deconvolution')
    estimated = htk_cdeconv.rgb_separate_stains_xu_snmf(
        pixel_sample.T, **vars(snmf_args))
    stain_matrix = htk_cdeconv.complement_stain_matrix(estimated)

    with open(args.returnParameterFile, 'w') as out:
        # Stain numbering in the output starts at 1.
        for number, stain in enumerate(stain_matrix.T, start=1):
            components = ','.join(str(value) for value in stain)
            out.write('stainColor_{} = {}\n'.format(number, components))
def main(args):
    """Compute a stain matrix from sampled pixels and write it out.

    The arguments are split into groups with ``utils.splitArgs``; the
    ``dask``, ``sample`` and ``macenko`` groups are used here.  Sampled
    pixels are passed (transposed) to ``rgb_separate_stains_macenko_pca``
    together with the ``macenko`` group as keyword arguments.  One
    ``stainColor_<n> = <comma separated values>`` line per row of the
    transposed result is written to ``args.returnParameterFile``.
    """
    args = utils.splitArgs(args)
    macenko_args = args.macenko
    macenko_args.I_0 = numpy.array(macenko_args.I_0)

    utils.create_dask_client(args.dask)
    pixel_sample = utils.sample_pixels(args.sample)

    stain_matrix = rgb_separate_stains_macenko_pca(
        pixel_sample.T, **vars(macenko_args))

    with open(args.returnParameterFile, 'w') as out:
        # Stain numbering in the output starts at 1.
        for number, stain in enumerate(stain_matrix.T, start=1):
            components = ','.join(str(value) for value in stain)
            out.write('stainColor_{} = {}\n'.format(number, components))
def main(args):
    """Compute the background intensity and write it to the parameter file.

    Every attribute of ``args`` except ``returnParameterFile`` and
    ``scheduler`` is forwarded to ``background_intensity`` as a keyword
    argument.  The result is written to ``args.returnParameterFile`` as a
    single ``BackgroundIntensity = <comma separated values>`` line.
    """
    not_forwarded = {'returnParameterFile', 'scheduler'}
    kwargs = {
        name: value
        for name, value in vars(args).items()
        if name not in not_forwarded
    }

    # Allow (some) default parameters to work.  Assume certain values
    # are not valid: -1 means "not given", so the keyword is dropped and
    # background_intensity falls back to its own default.
    for optional in ('sample_fraction', 'sample_approximate_total'):
        if kwargs[optional] == -1:
            kwargs.pop(optional)

    utils.create_dask_client(args)
    I_0 = background_intensity(**kwargs)

    with open(args.returnParameterFile, 'w') as out:
        values = ','.join(str(component) for component in I_0)
        out.write('BackgroundIntensity = ' + values + '\n')
def main(args):
    """Run ``ppc.count_slide`` on an image and write the resulting statistics.

    A label image is produced only when ``args`` has a non-``None``
    ``outputLabelImage`` attribute; in that case it is colorized and saved
    there.  The statistics are written to ``args.returnParameterFile`` as
    ``<field> = <value>`` lines.
    """
    utils.create_dask_client(args)
    ts = large_image.getTileSource(args.inputImageFile)

    make_label_image = getattr(args, 'outputLabelImage', None) is not None

    # The size limit and tile source are only passed to get_region_dict when
    # a label image is requested.
    region_extra = (args.maxRegionSize, ts) if make_label_image else ()
    region = utils.get_region_dict(args.region, *region_extra).get('region')

    ppc_params = ppc.Parameters(
        **{field: getattr(args, field) for field in ppc.Parameters._fields})

    results = ppc.count_slide(
        args.inputImageFile,
        ppc_params,
        region,
        args.tile_grouping,
        make_label_image,
    )

    if not make_label_image:
        # Without a label image the result holds the statistics only.
        stats, = results
    else:
        stats, label_image = results

        # Colorize label image.  Colors from the "coolwarm" color map
        color_map = np.empty((4, 3), dtype=np.uint8)
        color_map[ppc.Labels.NEGATIVE] = 255
        color_map[ppc.Labels.WEAK] = 60, 78, 194
        color_map[ppc.Labels.PLAIN] = 221, 220, 220
        color_map[ppc.Labels.STRONG] = 180, 4, 38

        # Indexing the table with the label array maps every label to its
        # color in one step.
        label_image = color_map[label_image]

        try:
            skimage.io.imsave(args.outputLabelImage, label_image)
        except ValueError:
            # This is likely caused by an unknown extension, so try again
            # under a '.png' name and move the file to the requested path.
            png_name = args.outputLabelImage + '.png'
            skimage.io.imsave(png_name, label_image)
            os.rename(png_name, args.outputLabelImage)

    with open(args.returnParameterFile, 'w') as out:
        for k, v in zip(stats._fields, stats):
            out.write(f'{k} = {v}\n')
def test_create_tile_nuclei_annotations(self):
    """Check nuclei annotations of a slide ROI against stored ground truth.

    Steps:

    1. Build a parameter namedtuple and a region of interest on the slide.
    2. Compute the per-tile foreground fraction and expect exactly two
       tiles at or above ``min_fgnd_frac``.
    3. For every tile in the ROI: Reinhard-normalize, color-deconvolve,
       segment nuclei and create both 'bbox' and 'boundary' annotations.
    4. Compare both annotation lists with the ground-truth files.
    """
    wsi_path = os.path.join(
        utilities.externaldata(
            'data/TCGA-06-0129-01Z-00-DX3.bae772ea-dd36-47ec-8185-761989be3cc8.svs.sha512'  # noqa
        ))

    # define parameters
    args = {
        'reference_mu_lab': [8.63234435, -0.11501964, 0.03868433],
        'reference_std_lab': [0.57506023, 0.10403329, 0.01364062],

        'stain_1': 'hematoxylin',
        'stain_2': 'eosin',
        'stain_3': 'null',

        'stain_1_vector': [-1, -1, -1],
        'stain_2_vector': [-1, -1, -1],
        'stain_3_vector': [-1, -1, -1],

        'min_fgnd_frac': 0.50,
        'analysis_mag': 20,
        'analysis_tile_size': 1200,

        'foreground_threshold': 60,
        'min_radius': 6,
        'max_radius': 12,
        'min_nucleus_area': 25,
        'local_max_search_radius': 8,

        # In Python 3 unittesting, the scheduler fails if it uses processes
        'scheduler': 'multithreading',  # None,
        'num_workers': -1,
        'num_threads_per_worker': 1,
    }

    args = collections.namedtuple('Parameters', args.keys())(**args)

    # read WSI
    ts = large_image.getTileSource(wsi_path)

    ts_metadata = ts.getMetadata()

    # Round the requested tile size down to a whole multiple of the
    # source's native tile width / height.
    analysis_tile_size = {
        'width': int(ts_metadata['tileWidth'] * np.floor(
            1.0 * args.analysis_tile_size / ts_metadata['tileWidth'])),
        'height': int(ts_metadata['tileHeight'] * np.floor(
            1.0 * args.analysis_tile_size / ts_metadata['tileHeight']))
    }

    # define ROI
    roi = {
        'left': ts_metadata['sizeX'] / 2,
        'top': ts_metadata['sizeY'] * 3 / 4,
        'width': analysis_tile_size['width'],
        'height': analysis_tile_size['height'],
        'units': 'base_pixels'
    }

    # define tile iterator parameters
    it_kwargs = {
        'tile_size': {'width': args.analysis_tile_size},
        'scale': {'magnification': args.analysis_mag},
        'region': roi
    }

    # create dask client
    cli_utils.create_dask_client(args)

    # get tile foreground at low res
    im_fgnd_mask_lres, fgnd_seg_scale = \
        cli_utils.segment_wsi_foreground_at_low_res(ts)

    # compute tile foreground fraction
    tile_fgnd_frac_list = htk_utils.compute_tile_foreground_fraction(
        wsi_path, im_fgnd_mask_lres, fgnd_seg_scale,
        it_kwargs)

    num_fgnd_tiles = np.count_nonzero(
        tile_fgnd_frac_list >= args.min_fgnd_frac)

    np.testing.assert_equal(num_fgnd_tiles, 2)

    # create nuclei annotations
    nuclei_bbox_annot_list = []
    nuclei_bndry_annot_list = []

    for tile_info in ts.tileIterator(
            format=large_image.tilesource.TILE_FORMAT_NUMPY,
            **it_kwargs):

        # keep the first three channels only
        im_tile = tile_info['tile'][:, :, :3]

        # perform color normalization
        im_nmzd = htk_cnorm.reinhard(im_tile,
                                     args.reference_mu_lab,
                                     args.reference_std_lab)

        # perform color deconvolution
        w = cli_utils.get_stain_matrix(args)

        im_stains = htk_cdeconv.color_deconvolution(im_nmzd, w).Stains

        # The ``np.float`` alias was deprecated in NumPy 1.20 and removed
        # in 1.24; the builtin ``float`` selects the same float64 dtype.
        im_nuclei_stain = im_stains[:, :, 0].astype(float)

        # segment nuclei
        im_nuclei_seg_mask = htk_nuclear.detect_nuclei_kofahi(
            im_nuclei_stain,
            im_nuclei_stain < args.foreground_threshold,
            args.min_radius,
            args.max_radius,
            args.min_nucleus_area,
            args.local_max_search_radius)

        # generate nuclei annotations as bboxes
        cur_bbox_annot_list = cli_utils.create_tile_nuclei_annotations(
            im_nuclei_seg_mask, tile_info, 'bbox')

        nuclei_bbox_annot_list.extend(cur_bbox_annot_list)

        # generate nuclei annotations as boundaries
        cur_bndry_annot_list = cli_utils.create_tile_nuclei_annotations(
            im_nuclei_seg_mask, tile_info, 'boundary')

        nuclei_bndry_annot_list.extend(cur_bndry_annot_list)

    # compare nuclei bbox annotations with gtruth
    nuclei_bbox_annot_gtruth_file = os.path.join(
        utilities.externaldata(
            'data/TCGA-06-0129-01Z-00-DX3_roi_nuclei_bbox.anot.sha512'  # noqa
        ))

    with open(nuclei_bbox_annot_gtruth_file, 'r') as fbbox_annot:
        nuclei_bbox_annot_list_gtruth = json.load(fbbox_annot)['elements']

    # Check that nuclei_bbox_annot_list is nearly equal to
    # nuclei_bbox_annot_list_gtruth
    assert len(nuclei_bbox_annot_list) == len(
        nuclei_bbox_annot_list_gtruth)
    # The lengths were just asserted equal, so zip pairs every element.
    for annot, annot_gtruth in zip(nuclei_bbox_annot_list,
                                   nuclei_bbox_annot_list_gtruth):
        np.testing.assert_array_almost_equal(
            annot['center'], annot_gtruth['center'], 0)
        np.testing.assert_almost_equal(
            annot['width'], annot_gtruth['width'], 1)
        np.testing.assert_almost_equal(
            annot['height'], annot_gtruth['height'], 1)

    # compare nuclei boundary annotations with gtruth
    nuclei_bndry_annot_gtruth_file = os.path.join(
        utilities.externaldata(
            'data/TCGA-06-0129-01Z-00-DX3_roi_nuclei_boundary.anot.sha512'  # noqa
        ))

    with open(nuclei_bndry_annot_gtruth_file, 'r') as fbndry_annot:
        nuclei_bndry_annot_list_gtruth = json.load(
            fbndry_annot)['elements']

    assert len(nuclei_bndry_annot_list) == len(
        nuclei_bndry_annot_list_gtruth)

    for annot, annot_gtruth in zip(nuclei_bndry_annot_list,
                                   nuclei_bndry_annot_list_gtruth):
        np.testing.assert_array_almost_equal(
            annot['points'], annot_gtruth['points'], 0)
def main(args):
    """Classify nuclei with a stored model and write per-class annotations.

    Loads the model from ``args.inputModelFile`` with joblib, reads the
    nuclei features through ``read_feature_file`` and the nuclei
    annotations from ``args.inputNucleiAnnotationFile``, predicts class
    probabilities per partition of the feature frame, assigns every nucleus
    the most probable class, colors its annotation accordingly and writes
    one annotation group per class to ``args.outputNucleiAnnotationFile``.

    Raises
    ------
    ValueError
        If the feature count of the model and of the feature file differ,
        or if the feature file and the annotation file describe a different
        number of nuclei.
    """
    print('\n>> CLI Parameters ...\n')

    print(args)

    #
    # Initiate Dask client
    #
    print('\n>> Creating Dask client ...\n')

    dask_client = cli_utils.create_dask_client(args)

    print(dask_client)

    #
    # read model file
    #
    print('\n>> Loading classification model ...\n')

    clf_model = joblib.load(args.inputModelFile)

    #
    # read feature file
    #
    print('\n>> Loading nuclei feature file ...\n')

    ddf = read_feature_file(args)

    # scikit-learn estimators expose ``n_features_in_`` (0.24+); the older
    # ``n_features_`` was removed in 1.2.  Prefer the new attribute and
    # fall back so that models from older releases still work.
    num_model_features = getattr(clf_model, 'n_features_in_', None)
    if num_model_features is None:
        num_model_features = clf_model.n_features_

    if len(ddf.columns) != num_model_features:

        raise ValueError('The number of features of the classification model '
                         'and the input feature file do not match.')

    #
    # read nuclei annotation file
    #
    print('\n>> Loading nuclei annotation file ...\n')

    with open(args.inputNucleiAnnotationFile) as f:
        nuclei_annot_list = json.load(f)['elements']

    if len(nuclei_annot_list) != len(ddf.index):

        raise ValueError('The number of nuclei in the feature file and the '
                         'annotation file do not match')

    #
    # Perform nuclei classification
    #
    print('\n>> Performing nuclei classification using Dask ...\n')

    def predict_nuclei_class_prob(df, clf_model):
        # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values``
        # returns the same array and exists in every pandas version.
        return pd.DataFrame(data=clf_model.predict_proba(df.values),
                            columns=clf_model.classes_)

    # Empty frame describing the columns / dtype of each partition result.
    outfmt = pd.DataFrame(columns=clf_model.classes_, dtype=np.float64)

    df_class_prob = ddf.map_partitions(predict_nuclei_class_prob, clf_model,
                                       meta=outfmt).compute()

    # Column label (i.e. class) with the highest probability in each row.
    pred_class = df_class_prob.idxmax(axis=1)

    #
    # Group nuclei annotations by class
    #
    print('\n>> Grouping nuclei annotations by class ...\n')

    num_classes = len(clf_model.classes_)

    nuclei_annot_by_class = {cls: [] for cls in clf_model.classes_}

    class_color_map = dict(zip(clf_model.classes_,
                               gen_distinct_rgb_colors(num_classes, seed=1)))

    # The color string only depends on the class, so build it once per
    # class instead of once per nucleus.  The color components are scaled
    # by 255 and rounded to integers.
    class_line_color = {
        cls: 'rgb(%s)' % ','.join(
            [str(int(round(col * 255))) for col in class_color_map[cls]])
        for cls in clf_model.classes_
    }

    # Both sequences were checked to describe the same number of nuclei;
    # iterating the Series yields its values in positional order.
    for cur_anot, cur_class in zip(nuclei_annot_list, pred_class):
        cur_anot['lineColor'] = class_line_color[cur_class]
        nuclei_annot_by_class[cur_class].append(cur_anot)

    #
    # Write annotation file
    #
    print('\n>> Writing classified nuclei annotation file ...\n')

    annot_fname = os.path.splitext(
        os.path.basename(args.outputNucleiAnnotationFile))[0]

    annotation = [
        {
            "name": annot_fname + '-nuclei-class-' + str(cls),
            "elements": nuclei_annot_by_class[cls]
        }
        for cls in clf_model.classes_
    ]

    with open(args.outputNucleiAnnotationFile, 'w') as annotation_file:
        json.dump(annotation, annotation_file, indent=2, sort_keys=False)
def main(args):
    """Detect nuclei, compute their features and write both to disk.

    Workflow:

    1. Validate the arguments with ``check_args`` and start a Dask client.
    2. Open ``args.inputImageFile``; the image is treated as a whole-slide
       image when its metadata carries a magnification.
    3. For whole-slide images, compute the foreground fraction of every
       tile (all tiles count as full foreground when an ROI is given) and,
       for whole-image runs, the Reinhard color statistics.
    4. Schedule ``compute_tile_nuclei_features`` for every tile that is not
       skipped and gather annotations and feature tables.
    5. Write the annotations as JSON to ``args.outputNucleiAnnotationFile``
       and the features to ``args.outputNucleiFeatureFile`` (.csv or .h5).

    Raises
    ------
    ValueError
        If the feature file extension is neither ``.csv`` nor ``.h5``.
    """
    total_start_time = time.time()

    print('\n>> CLI Parameters ...\n')

    print(args)

    check_args(args)

    feature_file_format = os.path.splitext(args.outputNucleiFeatureFile)[1]

    # An ROI made of -1 values only means "analyze the whole image".
    process_whole_image = bool(np.all(np.array(args.analysis_roi) == -1))

    #
    # Initiate Dask client
    #
    print('\n>> Creating Dask client ...\n')

    start_time = time.time()

    c = cli_utils.create_dask_client(args)

    print(c)

    dask_setup_time = time.time() - start_time
    print(f'Dask setup time = {dask_setup_time} seconds')

    #
    # Read Input Image
    #
    print('\n>> Reading input image ... \n')

    ts = large_image.getTileSource(args.inputImageFile)

    ts_metadata = ts.getMetadata()

    print(json.dumps(ts_metadata, indent=2))

    is_wsi = ts_metadata['magnification'] is not None

    #
    # Compute tissue/foreground mask at low-res for whole slide images
    #
    if is_wsi and process_whole_image:

        print('\n>> Computing tissue/foreground mask at low-res ...\n')

        start_time = time.time()

        im_fgnd_mask_lres, fgnd_seg_scale = \
            cli_utils.segment_wsi_foreground_at_low_res(ts)

        fgnd_time = time.time() - start_time

        print('low-res foreground mask computation time = {}'.format(
            cli_utils.disp_time_hms(fgnd_time)))

    #
    # Compute foreground fraction of tiles in parallel using Dask
    #
    # Default for non-WSI input; not consulted there because the skip test
    # below is guarded by ``is_wsi``.
    tile_fgnd_frac_list = [1.0]

    it_kwargs = {
        'tile_size': {'width': args.analysis_tile_size},
        'scale': {'magnification': args.analysis_mag},
    }

    if not process_whole_image:

        it_kwargs['region'] = {
            'left': args.analysis_roi[0],
            'top': args.analysis_roi[1],
            'width': args.analysis_roi[2],
            'height': args.analysis_roi[3],
            'units': 'base_pixels'
        }

    if is_wsi:

        print('\n>> Computing foreground fraction of all tiles ...\n')

        start_time = time.time()

        num_tiles = ts.getSingleTile(**it_kwargs)['iterator_range']['position']

        print(f'Number of tiles = {num_tiles}')

        if process_whole_image:

            tile_fgnd_frac_list = htk_utils.compute_tile_foreground_fraction(
                args.inputImageFile, im_fgnd_mask_lres, fgnd_seg_scale,
                it_kwargs
            )

        else:

            # Must be an array, not a plain list: the element-wise
            # comparison below raises TypeError for ``list >= float``.
            tile_fgnd_frac_list = np.array([1.0] * num_tiles)

        num_fgnd_tiles = np.count_nonzero(
            tile_fgnd_frac_list >= args.min_fgnd_frac)

        percent_fgnd_tiles = 100.0 * num_fgnd_tiles / num_tiles

        fgnd_frac_comp_time = time.time() - start_time

        # str.format needs '.2f' for two decimals and a single '%'
        # ('%%' is only an escape in %-style formatting).
        print('Number of foreground tiles = {:d} ({:.2f}%)'.format(
            num_fgnd_tiles, percent_fgnd_tiles))

        print('Tile foreground fraction computation time = {}'.format(
            cli_utils.disp_time_hms(fgnd_frac_comp_time)))

    #
    # Compute reinhard stats for color normalization
    #
    src_mu_lab = None
    src_sigma_lab = None

    if is_wsi and process_whole_image:

        print('\n>> Computing reinhard color normalization stats ...\n')

        start_time = time.time()

        src_mu_lab, src_sigma_lab = htk_cnorm.reinhard_stats(
            args.inputImageFile, 0.01, magnification=args.analysis_mag)

        rstats_time = time.time() - start_time

        print('Reinhard stats computation time = {}'.format(
            cli_utils.disp_time_hms(rstats_time)))

    #
    # Detect and compute nuclei features in parallel using Dask
    #
    print('\n>> Detecting nuclei and computing features ...\n')

    start_time = time.time()

    tile_result_list = []

    for tile in ts.tileIterator(**it_kwargs):

        tile_position = tile['tile_position']['position']

        # NOTE(review): tiles whose fraction equals min_fgnd_frac are
        # counted as foreground above (>=) but skipped here (<=).
        if is_wsi and tile_fgnd_frac_list[tile_position] <= args.min_fgnd_frac:
            continue

        # detect nuclei
        cur_result = dask.delayed(compute_tile_nuclei_features)(
            args.inputImageFile,
            tile_position,
            args, it_kwargs,
            src_mu_lab, src_sigma_lab
        )

        # append result to list
        tile_result_list.append(cur_result)

    # Evaluate all delayed per-tile computations in one go.
    tile_result_list = dask.delayed(tile_result_list).compute()

    nuclei_annot_list = [annot
                         for annot_list, fdata in tile_result_list
                         for annot in annot_list]

    nuclei_fdata = pd.DataFrame()

    if len(nuclei_annot_list) > 0:

        # Tiles that produced no feature table report ``None``.
        nuclei_fdata = pd.concat([
            fdata
            for annot_list, fdata in tile_result_list if fdata is not None],
            ignore_index=True
        )

    nuclei_detection_time = time.time() - start_time

    print('Number of nuclei = {}'.format(len(nuclei_annot_list)))

    print('Nuclei detection time = {}'.format(
        cli_utils.disp_time_hms(nuclei_detection_time)))

    #
    # Write annotation file
    #
    print('\n>> Writing annotation file ...\n')

    annot_fname = os.path.splitext(
        os.path.basename(args.outputNucleiAnnotationFile))[0]

    annotation = {
        'name': annot_fname + '-nuclei-' + args.nuclei_annotation_format,
        'elements': nuclei_annot_list
    }

    with open(args.outputNucleiAnnotationFile, 'w') as annotation_file:
        json.dump(annotation, annotation_file, indent=2, sort_keys=False)

    #
    # Create CSV Feature file
    #
    print('>> Writing CSV feature file')

    if feature_file_format == '.csv':

        nuclei_fdata.to_csv(args.outputNucleiFeatureFile, index=False)

    elif feature_file_format == '.h5':

        # ``key`` is passed by keyword: recent pandas releases warn about
        # positional arguments after the path.
        nuclei_fdata.to_hdf(args.outputNucleiFeatureFile, key='Features',
                            format='table', mode='w')

    else:

        raise ValueError('Extension of output feature file must be .csv or .h5')

    total_time_taken = time.time() - total_start_time

    print('Total analysis time = {}'.format(
        cli_utils.disp_time_hms(total_time_taken)))
def main(args):
    """Detect nuclei in an image and write them to an annotation file.

    Workflow:

    1. Validate the input file and the vector-valued arguments, then start
       a Dask client.
    2. Open ``args.inputImageFile``; the image is treated as a whole-slide
       image when its metadata carries a magnification.
    3. For whole-slide images, compute the foreground fraction of every
       tile (all tiles count as full foreground when an ROI is given) and,
       for whole-image runs, the Reinhard color statistics.
    4. Schedule ``detect_tile_nuclei`` for every tile that is not skipped
       and flatten the per-tile results.
    5. Write the annotations as JSON to ``args.outputNucleiAnnotationFile``.

    Raises
    ------
    OSError
        If ``args.inputImageFile`` is not an existing file.
    ValueError
        If ``reference_mu_lab`` / ``reference_std_lab`` do not have three
        elements or ``analysis_roi`` does not have four.
    """
    total_start_time = time.time()

    print('\n>> CLI Parameters ...\n')

    print(args)

    if not os.path.isfile(args.inputImageFile):
        raise OSError('Input image file does not exist.')

    if len(args.reference_mu_lab) != 3:
        raise ValueError('Reference Mean LAB should be a 3 element vector.')

    if len(args.reference_std_lab) != 3:
        raise ValueError('Reference Stddev LAB should be a 3 element vector.')

    if len(args.analysis_roi) != 4:
        raise ValueError('Analysis ROI must be a vector of 4 elements.')

    # An ROI made of -1 values only means "analyze the whole image".
    process_whole_image = bool(np.all(np.array(args.analysis_roi) == -1))

    #
    # Initiate Dask client
    #
    print('\n>> Creating Dask client ...\n')

    start_time = time.time()

    c = cli_utils.create_dask_client(args)

    print(c)

    dask_setup_time = time.time() - start_time
    print('Dask setup time = {}'.format(
        cli_utils.disp_time_hms(dask_setup_time)))

    #
    # Read Input Image
    #
    print('\n>> Reading input image ... \n')

    ts = large_image.getTileSource(args.inputImageFile)

    ts_metadata = ts.getMetadata()

    print(json.dumps(ts_metadata, indent=2))

    is_wsi = ts_metadata['magnification'] is not None

    #
    # Compute tissue/foreground mask at low-res for whole slide images
    #
    if is_wsi and process_whole_image:

        print('\n>> Computing tissue/foreground mask at low-res ...\n')

        start_time = time.time()

        im_fgnd_mask_lres, fgnd_seg_scale = \
            cli_utils.segment_wsi_foreground_at_low_res(ts)

        fgnd_time = time.time() - start_time

        print('low-res foreground mask computation time = {}'.format(
            cli_utils.disp_time_hms(fgnd_time)))

    #
    # Compute foreground fraction of tiles in parallel using Dask
    #
    # Default for non-WSI input; not consulted there because the skip test
    # below is guarded by ``is_wsi``.
    tile_fgnd_frac_list = [1.0]

    it_kwargs = {
        'tile_size': {'width': args.analysis_tile_size},
        'scale': {'magnification': args.analysis_mag},
    }

    if not process_whole_image:

        it_kwargs['region'] = {
            'left': args.analysis_roi[0],
            'top': args.analysis_roi[1],
            'width': args.analysis_roi[2],
            'height': args.analysis_roi[3],
            'units': 'base_pixels'
        }

    if is_wsi:

        print('\n>> Computing foreground fraction of all tiles ...\n')

        start_time = time.time()

        num_tiles = ts.getSingleTile(**it_kwargs)['iterator_range']['position']

        print(f'Number of tiles = {num_tiles}')

        if process_whole_image:

            tile_fgnd_frac_list = htk_utils.compute_tile_foreground_fraction(
                args.inputImageFile, im_fgnd_mask_lres, fgnd_seg_scale,
                it_kwargs)

        else:

            # With an explicit ROI every tile counts as full foreground.
            tile_fgnd_frac_list = np.full(num_tiles, 1.0)

        num_fgnd_tiles = np.count_nonzero(
            tile_fgnd_frac_list >= args.min_fgnd_frac)

        percent_fgnd_tiles = 100.0 * num_fgnd_tiles / num_tiles

        fgnd_frac_comp_time = time.time() - start_time

        # str.format needs '.2f' for two decimals and a single '%'
        # ('%%' is only an escape in %-style formatting).
        print('Number of foreground tiles = {:d} ({:.2f}%)'.format(
            num_fgnd_tiles, percent_fgnd_tiles))

        print('Tile foreground fraction computation time = {}'.format(
            cli_utils.disp_time_hms(fgnd_frac_comp_time)))

    #
    # Compute reinhard stats for color normalization
    #
    src_mu_lab = None
    src_sigma_lab = None

    if is_wsi and process_whole_image:

        print('\n>> Computing reinhard color normalization stats ...\n')

        start_time = time.time()

        src_mu_lab, src_sigma_lab = htk_cnorm.reinhard_stats(
            args.inputImageFile, 0.01, magnification=args.analysis_mag)

        rstats_time = time.time() - start_time

        print('Reinhard stats computation time = {}'.format(
            cli_utils.disp_time_hms(rstats_time)))

    #
    # Detect nuclei in parallel using Dask
    #
    print('\n>> Detecting nuclei ...\n')

    start_time = time.time()

    tile_nuclei_list = []

    for tile in ts.tileIterator(**it_kwargs):

        tile_position = tile['tile_position']['position']

        # NOTE(review): tiles whose fraction equals min_fgnd_frac are
        # counted as foreground above (>=) but skipped here (<=).
        if is_wsi and tile_fgnd_frac_list[tile_position] <= args.min_fgnd_frac:
            continue

        # detect nuclei
        cur_nuclei_list = dask.delayed(detect_tile_nuclei)(
            args.inputImageFile,
            tile_position,
            args, it_kwargs,
            src_mu_lab, src_sigma_lab)

        # append result to list
        tile_nuclei_list.append(cur_nuclei_list)

    # Evaluate all delayed per-tile detections in one go.
    tile_nuclei_list = dask.delayed(tile_nuclei_list).compute()

    # Flatten the per-tile lists into a single list of annotations.
    nuclei_list = [
        anot for anot_list in tile_nuclei_list for anot in anot_list
    ]

    nuclei_detection_time = time.time() - start_time

    print('Number of nuclei = {}'.format(len(nuclei_list)))

    print('Nuclei detection time = {}'.format(
        cli_utils.disp_time_hms(nuclei_detection_time)))

    #
    # Write annotation file
    #
    print('\n>> Writing annotation file ...\n')

    annot_fname = os.path.splitext(
        os.path.basename(args.outputNucleiAnnotationFile))[0]

    annotation = {
        "name": annot_fname + '-nuclei-' + args.nuclei_annotation_format,
        "elements": nuclei_list
    }

    with open(args.outputNucleiAnnotationFile, 'w') as annotation_file:
        json.dump(annotation, annotation_file, indent=2, sort_keys=False)

    total_time_taken = time.time() - total_start_time

    print('Total analysis time = {}'.format(
        cli_utils.disp_time_hms(total_time_taken)))