def test_list_files():
    with tempfile.TemporaryDirectory() as temp_dir:
        # set up temp_dir files
        filenames = [
            'tf.txt', 'othertf.txt', 'test.out', 'test.csv',
        ]
        for filename in filenames:
            pathlib.Path(os.path.join(temp_dir, filename)).touch()

        # add extra folder (shouldn't be picked up)
        os.mkdir(os.path.join(temp_dir, 'badfolder_test'))

        # note: list.sort() returns None, so comparing the return values of .sort()
        # would always pass trivially; compare with sorted() instead

        # test substrs is None (default)
        get_all = iou.list_files(temp_dir)
        assert sorted(get_all) == sorted(filenames)

        # test substrs is not list (single string)
        get_txt = iou.list_files(temp_dir, substrs='.txt')
        assert sorted(get_txt) == sorted(filenames[0:2])

        # test substrs is list
        get_test_and_other = iou.list_files(temp_dir, substrs=['test', 'other'])
        assert sorted(get_test_and_other) == sorted(filenames[1:])
def compute_complete_expression_matrices(segmentation_labels, tiff_dir, img_sub_folder,
                                          is_mibitiff=False, points=None, batch_size=5):
    """Takes the segmented data and computes the expression matrices batch-wise
    while also validating inputs.

    Args:
        segmentation_labels (xarray): an xarray with the segmented data
        tiff_dir (str): the name of the directory which contains the single_channel_inputs
        img_sub_folder (str): the name of the folder where the TIF images are located
        is_mibitiff (bool): a flag to indicate whether or not the base images are MIBItiffs
        points (list): a list of points we wish to analyze; if None, defaults to all points
        batch_size (int): how many points to process per batch; adjust as necessary
            for speed and memory considerations

    Returns:
        combined_cell_size_normalized_data (pandas.DataFrame): the size_norm transformed data
        combined_arcsinh_transformed_data (pandas.DataFrame): the arcsinh transformed data
    """

    # if no points are specified, then load all the points
    if points is None:
        # handle mibitiffs with an assumed file structure
        if is_mibitiff:
            filenames = io_utils.list_files(tiff_dir, substrs=['.tif'])
            points = io_utils.extract_delimited_names(filenames, delimiter=None)
        # otherwise assume the tree-like directory as defined for tree loading
        else:
            filenames = io_utils.list_folders(tiff_dir)
            points = filenames

    # check that every given point exists in segmentation_labels (img loaders will fail otherwise)
    point_values = [
        point for point in points
        if point not in segmentation_labels['fovs'].values
    ]

    if point_values:
        raise ValueError(
            f"Invalid point values specified: "
            f"points {','.join(point_values)} not found in segmentation_labels fovs"
        )

    # get full filenames from given points
    filenames = io_utils.list_files(tiff_dir, substrs=points)

    # sort the points
    points.sort()
    filenames.sort()

    # define some vars for batch processing
    cohort_len = len(points)

    # create the final dfs to store the processed data
    combined_cell_size_normalized_data = pd.DataFrame()
    combined_arcsinh_transformed_data = pd.DataFrame()

    # iterate over all the batches
    for batch_names, batch_files in zip(
            [points[i:i + batch_size] for i in range(0, cohort_len, batch_size)],
            [filenames[i:i + batch_size] for i in range(0, cohort_len, batch_size)]):
        # extract the image data for each batch
        if is_mibitiff:
            image_data = data_utils.load_imgs_from_mibitiff(data_dir=tiff_dir,
                                                            mibitiff_files=batch_files)
        else:
            image_data = data_utils.load_imgs_from_tree(data_dir=tiff_dir,
                                                        img_sub_folder=img_sub_folder,
                                                        fovs=batch_names)

        # as well as the labels corresponding to each of them
        current_labels = segmentation_labels.loc[batch_names, :, :, :]

        # segment the imaging data
        cell_size_normalized_data, arcsinh_transformed_data = generate_expression_matrix(
            segmentation_labels=current_labels,
            image_data=image_data
        )

        # now append to the final dfs to return
        combined_cell_size_normalized_data = combined_cell_size_normalized_data.append(
            cell_size_normalized_data)
        combined_arcsinh_transformed_data = combined_arcsinh_transformed_data.append(
            arcsinh_transformed_data)

    return combined_cell_size_normalized_data, combined_arcsinh_transformed_data
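# Usage sketch (illustrative only): the paths, fov names, and the pre-loaded
# segmentation_labels xarray below are assumptions for demonstration, not values
# defined in this repo.
#
#   seg_labels = xr.load_dataarray('/path/to/segmentation_labels.nc')   # hypothetical file
#   size_norm, arcsinh = compute_complete_expression_matrices(
#       segmentation_labels=seg_labels,
#       tiff_dir='/path/to/single_channel_inputs',                      # hypothetical dir
#       img_sub_folder='TIFs',                                          # hypothetical sub-folder
#       is_mibitiff=False,
#       points=['Point1', 'Point2'],                                    # hypothetical fovs
#       batch_size=5)
#   size_norm.to_csv('cell_size_normalized_data.csv')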
def load_imgs_from_multitiff(data_dir, multitiff_files=None, channels=None,
                             delimiter=None, dtype='int16'):
    """Load images from a series of multi-channel tiff files.

    This function takes a set of multi-channel tiff files and loads the images into
    an xarray. The type used to store the images will be the same as that of the
    images stored in the multi-channel tiff files.

    This function differs from `load_imgs_from_mibitiff` in that proprietary metadata
    is not needed, which is useful for loading more general multi-channel tiff files.

    Args:
        data_dir (str): directory containing multitiffs
        multitiff_files (list): list of multi-channel tiff files to load. If None,
            all multitiff files in data_dir are loaded.
        channels (list): optional list of channels to load. Unlike MIBItiff, this must
            be given as a numeric list of indices, since there is no metadata containing
            channel names.
        delimiter (str): optional delimiter-character/string which separates fov names
            from the rest of the file name. Default is None.
        dtype (str/type): optional specifier of image type. Overwritten with warning for
            float images

    Returns:
        img_xr (xr.DataArray): xarray with shape [fovs, x_dim, y_dim, channels]
    """

    if not multitiff_files:
        multitiff_files = iou.list_files(data_dir, substrs=['.tif'])

    # extract fov names w/ delimiter agnosticism
    fovs = iou.extract_delimited_names(multitiff_files, delimiter=delimiter)

    multitiff_files = [
        os.path.join(data_dir, mt_file) for mt_file in multitiff_files
    ]

    test_img = io.imread(multitiff_files[0], plugin='tifffile')

    # check to make sure that float dtype was supplied if image data is float
    data_dtype = test_img.dtype
    if np.issubdtype(data_dtype, np.floating):
        if not np.issubdtype(dtype, np.floating):
            warnings.warn(
                f"The supplied non-float dtype {dtype} was overwritten to {data_dtype}, "
                f"because the loaded images are floats")
            dtype = data_dtype

    # extract data
    img_data = []
    for multitiff_file in multitiff_files:
        img_data.append(io.imread(multitiff_file, plugin='tifffile'))
    img_data = np.stack(img_data, axis=0)
    img_data = img_data.astype(dtype)

    if channels:
        img_data = img_data[:, :, :, channels]

    # create xarray with image data
    img_xr = xr.DataArray(
        img_data,
        coords=[
            fovs,
            range(img_data.shape[1]),
            range(img_data.shape[2]),
            channels if channels else range(img_data.shape[3])
        ],
        dims=["fovs", "rows", "cols", "channels"])

    return img_xr
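# Usage sketch (illustrative only): the directory, channel indices, and delimiter
# below are assumptions, not values defined in this repo.
#
#   img_xr = load_imgs_from_multitiff('/path/to/multitiffs',   # hypothetical dir
#                                     channels=[0, 1, 2],      # first three channel indices
#                                     delimiter='_')
#   print(img_xr.dims)  # ('fovs', 'rows', 'cols', 'channels')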
def load_imgs_from_dir(data_dir, imgdim_name='compartments', image_name='img_data',
                       delimiter=None, dtype="int16", variable_sizes=False,
                       force_ints=False):
    """Takes a set of images from a directory and loads them into an xarray
    based on filename prefixes.

    Args:
        data_dir (str): directory containing images
        imgdim_name (str): sets the name of the last dimension of the output xarray
        image_name (str): sets the name of the last coordinate in the output xarray
        delimiter (str): character used to determine the file-prefix containing the fov name.
            Default is None.
        dtype (str/type): data type to load/store
        variable_sizes (bool): Dynamically determine image sizes and pad smaller imgs w/ zeros
        force_ints (bool): If dtype is an integer, forcefully convert float imgs to ints.
            Default is False.

    Returns:
        img_xr (xr.DataArray): xarray with shape [fovs, x_dim, y_dim, 1]
    """

    imgs = iou.list_files(data_dir, substrs=['.tif', '.jpg', '.png'])

    # filter by delimiter presence
    if delimiter is not None:
        imgs = [img for img in imgs if delimiter in img]

    imgs.sort()

    if len(imgs) == 0:
        raise ValueError(f"No images found in directory, {data_dir}")

    test_img = io.imread(os.path.join(data_dir, imgs[0]))

    # check to make sure that float dtype was supplied if image data is float
    data_dtype = test_img.dtype
    if force_ints and np.issubdtype(dtype, np.integer):
        if not np.issubdtype(data_dtype, np.integer):
            warnings.warn(
                f"The loaded {data_dtype} images were forcefully "
                f"overwritten with the supplied integer dtype {dtype}")
    elif np.issubdtype(data_dtype, np.floating):
        if not np.issubdtype(dtype, np.floating):
            warnings.warn(
                f"The supplied non-float dtype {dtype} was overwritten to {data_dtype}, "
                f"because the loaded images are floats")
            dtype = data_dtype

    if variable_sizes:
        img_data = np.zeros((len(imgs), 1024, 1024, 1), dtype=dtype)
    else:
        img_data = np.zeros((len(imgs), test_img.shape[0], test_img.shape[1], 1),
                            dtype=dtype)

    for img in range(len(imgs)):
        if variable_sizes:
            temp_img = io.imread(os.path.join(data_dir, imgs[img]))
            img_data[img, :temp_img.shape[0], :temp_img.shape[1], 0] = temp_img.astype(dtype)
        else:
            img_data[img, :, :, 0] = io.imread(os.path.join(data_dir, imgs[img])).astype(dtype)

    # check to make sure that dtype wasn't too small for range of data
    if np.min(img_data) < 0:
        raise ValueError("Integer overflow from loading TIF image, try a larger dtype")

    if variable_sizes:
        row_coords, col_coords = range(1024), range(1024)
    else:
        row_coords, col_coords = range(test_img.shape[0]), range(test_img.shape[1])

    # get fov name from imgs
    fovs = iou.extract_delimited_names(imgs, delimiter=delimiter)

    img_xr = xr.DataArray(img_data.astype(dtype),
                          coords=[fovs, row_coords, col_coords, [image_name]],
                          dims=["fovs", "rows", "cols", imgdim_name])

    return img_xr
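# Usage sketch (illustrative only): the directory, coordinate name, and delimiter
# below are assumptions, not values defined in this repo.
#
#   label_xr = load_imgs_from_dir('/path/to/label_maps',            # hypothetical dir
#                                 imgdim_name='compartments',
#                                 image_name='segmentation_label',  # hypothetical coord name
#                                 delimiter='_',
#                                 force_ints=True)
#   # one single-channel image per fov: shape [fovs, rows, cols, 1]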
def load_imgs_from_tree(data_dir, img_sub_folder=None, fovs=None, channels=None,
                        dtype="int16", variable_sizes=False):
    """Takes a set of imgs from a directory structure and loads them into an xarray.

    Args:
        data_dir (str): directory containing folders of images
        img_sub_folder (str): optional name of image sub-folder within each fov
        fovs (list): optional list of folders to load imgs from. Default loads all folders
        channels (list): optional list of imgs to load, otherwise loads all imgs
        dtype (str/type): dtype of array which will be used to store values
        variable_sizes (bool): if true, will pad loaded images with zeros to fit into array

    Returns:
        img_xr (xr.DataArray): xarray with shape [fovs, x_dim, y_dim, tifs]
    """

    if fovs is None:
        # get all fovs
        fovs = iou.list_folders(data_dir)
        fovs.sort()

    if len(fovs) == 0:
        raise ValueError(f"No fovs found in directory, {data_dir}")

    if img_sub_folder is None:
        # no img_sub_folder, change to empty string to read directly from base folder
        img_sub_folder = ""

    # get imgs from first fov if no img names supplied
    if channels is None:
        channels = iou.list_files(os.path.join(data_dir, fovs[0], img_sub_folder),
                                  substrs=['.tif', '.jpg', '.png'])

        # if taking all channels from directory, sort them alphabetically
        channels.sort()
    # otherwise, fill channel names with correct file extension
    elif not all([img.endswith(("tif", "tiff", "jpg", "png")) for img in channels]):
        channels = iou.list_files(os.path.join(data_dir, fovs[0], img_sub_folder),
                                  substrs=channels)

    if len(channels) == 0:
        raise ValueError("No images found in designated folder")

    test_img = io.imread(os.path.join(data_dir, fovs[0], img_sub_folder, channels[0]))

    # check to make sure that float dtype was supplied if image data is float
    data_dtype = test_img.dtype
    if np.issubdtype(data_dtype, np.floating):
        if not np.issubdtype(dtype, np.floating):
            warnings.warn(
                f"The supplied non-float dtype {dtype} was overwritten to {data_dtype}, "
                f"because the loaded images are floats")
            dtype = data_dtype

    if variable_sizes:
        img_data = np.zeros((len(fovs), 1024, 1024, len(channels)), dtype=dtype)
    else:
        img_data = np.zeros((len(fovs), test_img.shape[0], test_img.shape[1], len(channels)),
                            dtype=dtype)

    for fov in range(len(fovs)):
        for img in range(len(channels)):
            if variable_sizes:
                temp_img = io.imread(
                    os.path.join(data_dir, fovs[fov], img_sub_folder, channels[img]))
                img_data[fov, :temp_img.shape[0], :temp_img.shape[1], img] = temp_img
            else:
                img_data[fov, :, :, img] = io.imread(
                    os.path.join(data_dir, fovs[fov], img_sub_folder, channels[img]))

    # check to make sure that dtype wasn't too small for range of data
    if np.min(img_data) < 0:
        raise ValueError("Integer overflow from loading TIF image, try a larger dtype")

    if variable_sizes:
        row_coords, col_coords = range(1024), range(1024)
    else:
        row_coords, col_coords = range(test_img.shape[0]), range(test_img.shape[1])

    # remove .tif or .tiff from image name
    img_names = [os.path.splitext(img)[0] for img in channels]

    img_xr = xr.DataArray(img_data,
                          coords=[fovs, row_coords, col_coords, img_names],
                          dims=["fovs", "rows", "cols", "channels"])

    return img_xr
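# Usage sketch (illustrative only): assumes a cohort laid out as
# <data_dir>/<fov>/<img_sub_folder>/<channel>.tif; all names below are
# hypothetical, not values defined in this repo.
#
#   img_xr = load_imgs_from_tree('/path/to/cohort',              # hypothetical dir
#                                img_sub_folder='TIFs',          # hypothetical sub-folder
#                                fovs=['Point1', 'Point2'],      # hypothetical fov folders
#                                channels=['CD45.tif', 'CD8.tif'])
#   print(img_xr.shape)  # (2, rows, cols, 2)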
def load_imgs_from_mibitiff(data_dir, mibitiff_files=None, channels=None,
                            delimiter=None, dtype='int16'):
    """Load images from a series of MIBItiff files.

    This function takes a set of MIBItiff files and loads the images into an xarray.
    The type used to store the images will be the same as that of the MIBI images
    stored in the MIBItiff files.

    Args:
        data_dir (str): directory containing MIBItiffs
        mibitiff_files (list): list of MIBItiff files to load. If None,
            all MIBItiff files in data_dir are loaded.
        channels (list): optional list of channels to load. Defaults to `None`, in
            which case, all channels in the first MIBItiff are used.
        delimiter (str): optional delimiter-character/string which separates fov names
            from the rest of the file name. Defaults to None
        dtype (str/type): optional specifier of image type. Overwritten with warning for
            float images

    Returns:
        img_xr (xr.DataArray): xarray with shape [fovs, x_dim, y_dim, channels]
    """

    if not mibitiff_files:
        mibitiff_files = iou.list_files(data_dir, substrs=['.tif'])

    # extract fov names w/ delimiter agnosticism
    fovs = iou.extract_delimited_names(mibitiff_files, delimiter=delimiter)

    mibitiff_files = [
        os.path.join(data_dir, mt_file) for mt_file in mibitiff_files
    ]

    test_img = io.imread(mibitiff_files[0], plugin='tifffile')

    # check to make sure that float dtype was supplied if image data is float
    data_dtype = test_img.dtype
    if np.issubdtype(data_dtype, np.floating):
        if not np.issubdtype(dtype, np.floating):
            warnings.warn(
                f"The supplied non-float dtype {dtype} was overwritten to {data_dtype}, "
                f"because the loaded images are floats")
            dtype = data_dtype

    # if no channels specified, get them from first MIBItiff file
    if channels is None:
        channel_tuples = tiff.read(mibitiff_files[0]).channels
        channels = [channel_tuple[1] for channel_tuple in channel_tuples]

    # extract images from MIBItiff file
    img_data = []
    for mibitiff_file in mibitiff_files:
        img_data.append(tiff.read(mibitiff_file)[channels])
    img_data = np.stack(img_data, axis=0)
    img_data = img_data.astype(dtype)

    # create xarray with image data
    img_xr = xr.DataArray(img_data,
                          coords=[fovs,
                                  range(img_data[0].data.shape[0]),
                                  range(img_data[0].data.shape[1]),
                                  channels],
                          dims=["fovs", "rows", "cols", "channels"])

    return img_xr
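# Usage sketch (illustrative only): the directory, channel names, and delimiter
# below are assumptions, not values defined in this repo.
#
#   img_xr = load_imgs_from_mibitiff('/path/to/mibitiffs',       # hypothetical dir
#                                    channels=['CD45', 'CD8'],   # hypothetical channel names
#                                    delimiter='_')
#   print(img_xr.fovs.values)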
def create_deepcell_output(deepcell_input_dir, deepcell_output_dir, fovs=None,
                           suffix='_feature_0', host='https://deepcell.org',
                           job_type='multiplex'):
    """Handles all of the necessary data manipulation for running deepcell tasks.

    Creates .zip files (to be used as input for DeepCell), calls the
    run_deepcell_task method, and extracts the zipped output files to the
    specified output location.

    Args:
        deepcell_input_dir (str): Location of preprocessed files
            (assume deepcell_input_dir contains <fov>.tif for each fov in fovs list)
        deepcell_output_dir (str): Location to save DeepCell output (as .tif)
        fovs (list): List of fovs in preprocessing pipeline. If None, all .tif files
            in deepcell_input_dir will be considered as input fovs.
            Default: None
        suffix (str): Suffix for DeepCell output filename. e.g. for fovX, DeepCell output
            should be <fovX>+suffix.tif.
            Default: '_feature_0'
        host (str): Hostname and port for the kiosk-frontend API server.
            Default: 'https://deepcell.org'
        job_type (str): Name of job workflow (multiplex, segmentation, tracking).
            Default: 'multiplex'

    Raises:
        ValueError: Raised if there is some fov X (from fovs list) s.t.
            the file <deepcell_input_dir>/fovX.tif does not exist
    """

    if fovs is None:
        tifs = io_utils.list_files(deepcell_input_dir, substrs='.tif')
        fovs = io_utils.extract_delimited_names(tifs, delimiter='.')

    zip_path = os.path.join(deepcell_input_dir, 'fovs.zip')
    if os.path.isfile(zip_path):
        warnings.warn(f'{zip_path} will be overwritten.')

    with ZipFile(zip_path, 'w') as zipObj:
        for fov in fovs:
            filename = os.path.join(deepcell_input_dir, fov + '.tif')
            if not os.path.isfile(filename):
                raise ValueError('Could not find .tif file for %s. '
                                 'Invalid value for %s' % (fov, filename))
            zipObj.write(filename, os.path.basename(filename))

    run_deepcell_task(zip_path, deepcell_output_dir, host, job_type)
    os.remove(zip_path)

    # extract the .tif output
    zip_files = glob.glob(os.path.join(deepcell_output_dir, '*.zip'))
    zip_files.sort(key=os.path.getmtime)

    with ZipFile(zip_files[-1], 'r') as zipObj:
        zipObj.extractall(deepcell_output_dir)
        for fov in fovs:
            if fov + suffix + '.tif' not in zipObj.namelist():
                warnings.warn(f'Deep Cell output file was not found for {fov}.')
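# Usage sketch (illustrative only): both directories and the fov names are
# assumptions, not values defined in this repo; the call requires network access
# to the DeepCell API host.
#
#   create_deepcell_output(deepcell_input_dir='/path/to/deepcell_input',     # hypothetical dir
#                          deepcell_output_dir='/path/to/deepcell_output',   # hypothetical dir
#                          fovs=['Point1', 'Point2'])                        # hypothetical fovs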
jacc_mean = np.mean(jacc)

metrics[seed] = {'tissue_stats': tissue_stats,
                 'platform_stats': platform_stats,
                 'jacc': jacc_mean}

np.savez_compressed(os.path.join(benchmarking_dir, 'ilastik_metrics_jacc.npz'), **metrics)

# Retrained cellpose

# copy predictions
cellpose_dir = os.path.join(base_dir, 'benchmarking_accuracy/cellpose_predictions')
download_dir = os.path.join(base_dir, 'benchmarking_accuracy/cellpose_predictions_full')

for seed in ['1', '2', '3']:
    save_dir = os.path.join(cellpose_dir, 'split{}'.format(seed))
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    masks = list_files(download_dir + '/split{}'.format(seed), 'masks')

    for mask in masks:
        shutil.move(os.path.join(download_dir, 'split{}'.format(seed), mask),
                    os.path.join(save_dir, mask))

# load predictions into single npz
for seed in ['1', '2', '3']:
    output_shape = np.load(
        npz_dir + '/20201018_multiplex_seed_{}_test_256x256.npz'.format(seed))['y'].shape
    output = np.zeros(output_shape)
    for i in range(output.shape[0]):
        zero_filled = str(i).zfill(4)
        img = io.imread(os.path.join(cellpose_dir,
                                     'split{}/{}_img_cp_masks.tif'.format(seed, zero_filled)))
        output[i, :, :, 0] = img

    np.savez_compressed(os.path.join(cellpose_dir, 'split_{}_combined_masks.npz'.format(seed)),
                        y=output)
# create scalebar that goes from 0 to 1
tissue_array[0, 0] = 1

g = sns.heatmap(data=platform_array, annot=True, vmin=0, cmap='Blues')
plt.savefig(os.path.join(plot_dir, 'Figure_S3b_scalebar.pdf'))

# Figure 3d
data_dir = base_dir + 'Human_agreement/'
folders = ['DCIS_2328', 'Eliot_Point17', 'P101_T3_T4_Point2', 'cHL_Point8908']
folder_names = ['DCIS_MIBI', 'Colon_IF', 'Esophagus_MIBI', 'Hodgekins_Vectra']

f1_list, tissue_list, annotator_list = [], [], []
for i in range(len(folders)):
    # get all of the human annotations
    folder_path = os.path.join(data_dir, folders[i], 'annotations')
    img_names = io_utils.list_files(folder_path, '.tiff')

    imgs = []
    for img in img_names:
        current_img = io.imread(os.path.join(folder_path, img))
        imgs.append(current_img)

    f1_scores_human = figures.calculate_human_f1_scores(image_list=imgs)
    tissue_name = folder_names[i]
    f1_list.extend(f1_scores_human)
    tissue_list.extend([tissue_name] * len(f1_scores_human))
    annotator_list.extend(['human'] * len(f1_scores_human))

    # compare algorithm
    pred_img = io.imread(os.path.join(data_dir, folders[i], 'segmentation_label.tiff'))
    pred_img = np.expand_dims(pred_img, axis=0)
    f1_scores_alg = figures.calculate_alg_f1_scores(image_list=imgs,