def test_list_files():
    with tempfile.TemporaryDirectory() as temp_dir:
        # set up temp_dir files
        filenames = [
            'tf.txt', 'othertf.txt', 'test.out', 'test.csv',
        ]
        for filename in filenames:
            pathlib.Path(os.path.join(temp_dir, filename)).touch()

        # add extra folder (shouldn't be picked up)
        os.mkdir(os.path.join(temp_dir, 'badfolder_test'))

        # note: list.sort() returns None, so comparing the return values of .sort()
        # would always pass trivially; compare with sorted() instead

        # test substrs is None (default)
        get_all = iou.list_files(temp_dir)
        assert sorted(get_all) == sorted(filenames)

        # test substrs is not list (single string)
        get_txt = iou.list_files(temp_dir, substrs='.txt')
        assert sorted(get_txt) == sorted(filenames[0:2])

        # test substrs is list
        get_test_and_other = iou.list_files(temp_dir, substrs=['test', 'other'])
        assert sorted(get_test_and_other) == sorted(filenames[1:])
def compute_complete_expression_matrices(segmentation_labels, tiff_dir, img_sub_folder,
                                          is_mibitiff=False, points=None, batch_size=5):
    """Takes the segmented data and computes the expression matrices batch-wise
    while also validating inputs.

    Args:
        segmentation_labels (xarray): an xarray with the segmented data
        tiff_dir (str): the name of the directory which contains the single_channel_inputs
        img_sub_folder (str): the name of the folder where the TIF images are located
        is_mibitiff (bool): a flag to indicate whether or not the base images are MIBItiffs
        points (list): a list of points we wish to analyze; if None, defaults to all points
        batch_size (int): how many points to process per batch; adjust as necessary
            for speed and memory considerations

    Returns:
        combined_cell_size_normalized_data (pandas.DataFrame): the size_norm transformed data
        combined_arcsinh_transformed_data (pandas.DataFrame): the arcsinh transformed data
    """

    # if no points are specified, then load all the points
    if points is None:
        # handle mibitiffs with an assumed file structure
        if is_mibitiff:
            filenames = io_utils.list_files(tiff_dir, substrs=['.tif'])
            points = io_utils.extract_delimited_names(filenames, delimiter=None)
        # otherwise assume the tree-like directory as defined for tree loading
        else:
            filenames = io_utils.list_folders(tiff_dir)
            points = filenames

    # check that every given point exists in segmentation_labels (img loaders will fail otherwise)
    point_values = [
        point for point in points
        if point not in segmentation_labels['fovs'].values
    ]

    if point_values:
        raise ValueError(
            f"Invalid point values specified: "
            f"points {','.join(point_values)} not found in segmentation_labels fovs"
        )

    # get full filenames from given points
    filenames = io_utils.list_files(tiff_dir, substrs=points)

    # sort the points
    points.sort()
    filenames.sort()

    # define some vars for batch processing
    cohort_len = len(points)

    # create the final dfs to store the processed data
    combined_cell_size_normalized_data = pd.DataFrame()
    combined_arcsinh_transformed_data = pd.DataFrame()

    # iterate over all the batches
    for batch_names, batch_files in zip(
            [points[i:i + batch_size] for i in range(0, cohort_len, batch_size)],
            [filenames[i:i + batch_size] for i in range(0, cohort_len, batch_size)]):
        # extract the image data for each batch
        if is_mibitiff:
            image_data = data_utils.load_imgs_from_mibitiff(data_dir=tiff_dir,
                                                            mibitiff_files=batch_files)
        else:
            image_data = data_utils.load_imgs_from_tree(data_dir=tiff_dir,
                                                        img_sub_folder=img_sub_folder,
                                                        fovs=batch_names)

        # as well as the labels corresponding to each of them
        current_labels = segmentation_labels.loc[batch_names, :, :, :]

        # segment the imaging data
        cell_size_normalized_data, arcsinh_transformed_data = generate_expression_matrix(
            segmentation_labels=current_labels,
            image_data=image_data
        )

        # now append to the final dfs to return
        combined_cell_size_normalized_data = combined_cell_size_normalized_data.append(
            cell_size_normalized_data)
        combined_arcsinh_transformed_data = combined_arcsinh_transformed_data.append(
            arcsinh_transformed_data)

    return combined_cell_size_normalized_data, combined_arcsinh_transformed_data
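# Usage sketch (illustrative only): the paths, fov names, and the pre-loaded
# segmentation_labels xarray below are assumptions for demonstration, not values
# defined in this repo.
#
#   seg_labels = xr.load_dataarray('/path/to/segmentation_labels.nc')   # hypothetical file
#   size_norm, arcsinh = compute_complete_expression_matrices(
#       segmentation_labels=seg_labels,
#       tiff_dir='/path/to/single_channel_inputs',                      # hypothetical dir
#       img_sub_folder='TIFs',                                          # hypothetical sub-folder
#       is_mibitiff=False,
#       points=['Point1', 'Point2'],                                    # hypothetical fovs
#       batch_size=5)
#   size_norm.to_csv('cell_size_normalized_data.csv')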
def load_imgs_from_multitiff(data_dir, multitiff_files=None, channels=None,
                             delimiter=None, dtype='int16'):
    """Load images from a series of multi-channel tiff files.

    This function takes a set of multi-channel tiff files and loads the images into
    an xarray. The type used to store the images will be the same as that of the
    images stored in the multi-channel tiff files.

    This function differs from `load_imgs_from_mibitiff` in that proprietary metadata
    is not needed, which is useful for loading more general multi-channel tiff files.

    Args:
        data_dir (str): directory containing multitiffs
        multitiff_files (list): list of multi-channel tiff files to load. If None,
            all multitiff files in data_dir are loaded.
        channels (list): optional list of channels to load. Unlike MIBItiff, this must
            be given as a numeric list of indices, since there is no metadata containing
            channel names.
        delimiter (str): optional delimiter-character/string which separates fov names
            from the rest of the file name. Default is None.
        dtype (str/type): optional specifier of image type. Overwritten with warning for
            float images

    Returns:
        img_xr (xr.DataArray): xarray with shape [fovs, x_dim, y_dim, channels]
    """

    if not multitiff_files:
        multitiff_files = iou.list_files(data_dir, substrs=['.tif'])

    # extract fov names w/ delimiter agnosticism
    fovs = iou.extract_delimited_names(multitiff_files, delimiter=delimiter)

    multitiff_files = [
        os.path.join(data_dir, mt_file) for mt_file in multitiff_files
    ]

    test_img = io.imread(multitiff_files[0], plugin='tifffile')

    # check to make sure that float dtype was supplied if image data is float
    data_dtype = test_img.dtype
    if np.issubdtype(data_dtype, np.floating):
        if not np.issubdtype(dtype, np.floating):
            warnings.warn(
                f"The supplied non-float dtype {dtype} was overwritten to {data_dtype}, "
                f"because the loaded images are floats")
            dtype = data_dtype

    # extract data
    img_data = []
    for multitiff_file in multitiff_files:
        img_data.append(io.imread(multitiff_file, plugin='tifffile'))
    img_data = np.stack(img_data, axis=0)
    img_data = img_data.astype(dtype)

    if channels:
        img_data = img_data[:, :, :, channels]

    # create xarray with image data
    img_xr = xr.DataArray(
        img_data,
        coords=[
            fovs,
            range(img_data.shape[1]),
            range(img_data.shape[2]),
            channels if channels else range(img_data.shape[3])
        ],
        dims=["fovs", "rows", "cols", "channels"])

    return img_xr
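# Usage sketch (illustrative only): the directory, channel indices, and delimiter
# below are assumptions, not values defined in this repo.
#
#   img_xr = load_imgs_from_multitiff('/path/to/multitiffs',   # hypothetical dir
#                                     channels=[0, 1, 2],      # first three channel indices
#                                     delimiter='_')
#   print(img_xr.dims)  # ('fovs', 'rows', 'cols', 'channels')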
def load_imgs_from_dir(data_dir, imgdim_name='compartments', image_name='img_data',
                       delimiter=None, dtype="int16", variable_sizes=False,
                       force_ints=False):
    """Takes a set of images from a directory and loads them into an xarray
    based on filename prefixes.

    Args:
        data_dir (str): directory containing images
        imgdim_name (str): sets the name of the last dimension of the output xarray
        image_name (str): sets the name of the last coordinate in the output xarray
        delimiter (str): character used to determine the file-prefix containing the fov name.
            Default is None.
        dtype (str/type): data type to load/store
        variable_sizes (bool): Dynamically determine image sizes and pad smaller imgs w/ zeros
        force_ints (bool): If dtype is an integer, forcefully convert float imgs to ints.
            Default is False.

    Returns:
        img_xr (xr.DataArray): xarray with shape [fovs, x_dim, y_dim, 1]
    """

    imgs = iou.list_files(data_dir, substrs=['.tif', '.jpg', '.png'])

    # filter by delimiter presence
    if delimiter is not None:
        imgs = [img for img in imgs if delimiter in img]

    imgs.sort()

    if len(imgs) == 0:
        raise ValueError(f"No images found in directory, {data_dir}")

    test_img = io.imread(os.path.join(data_dir, imgs[0]))

    # check to make sure that float dtype was supplied if image data is float
    data_dtype = test_img.dtype
    if force_ints and np.issubdtype(dtype, np.integer):
        if not np.issubdtype(data_dtype, np.integer):
            warnings.warn(
                f"The loaded {data_dtype} images were forcefully "
                f"overwritten with the supplied integer dtype {dtype}")
    elif np.issubdtype(data_dtype, np.floating):
        if not np.issubdtype(dtype, np.floating):
            warnings.warn(
                f"The supplied non-float dtype {dtype} was overwritten to {data_dtype}, "
                f"because the loaded images are floats")
            dtype = data_dtype

    if variable_sizes:
        img_data = np.zeros((len(imgs), 1024, 1024, 1), dtype=dtype)
    else:
        img_data = np.zeros((len(imgs), test_img.shape[0], test_img.shape[1], 1),
                            dtype=dtype)

    for img in range(len(imgs)):
        if variable_sizes:
            temp_img = io.imread(os.path.join(data_dir, imgs[img]))
            img_data[img, :temp_img.shape[0], :temp_img.shape[1], 0] = temp_img.astype(dtype)
        else:
            img_data[img, :, :, 0] = io.imread(os.path.join(data_dir, imgs[img])).astype(dtype)

    # check to make sure that dtype wasn't too small for range of data
    if np.min(img_data) < 0:
        raise ValueError("Integer overflow from loading TIF image, try a larger dtype")

    if variable_sizes:
        row_coords, col_coords = range(1024), range(1024)
    else:
        row_coords, col_coords = range(test_img.shape[0]), range(test_img.shape[1])

    # get fov name from imgs
    fovs = iou.extract_delimited_names(imgs, delimiter=delimiter)

    img_xr = xr.DataArray(img_data.astype(dtype),
                          coords=[fovs, row_coords, col_coords, [image_name]],
                          dims=["fovs", "rows", "cols", imgdim_name])

    return img_xr
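# Usage sketch (illustrative only): the directory, coordinate name, and delimiter
# below are assumptions, not values defined in this repo.
#
#   label_xr = load_imgs_from_dir('/path/to/label_maps',            # hypothetical dir
#                                 imgdim_name='compartments',
#                                 image_name='segmentation_label',  # hypothetical coord name
#                                 delimiter='_',
#                                 force_ints=True)
#   # one single-channel image per fov: shape [fovs, rows, cols, 1]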
def load_imgs_from_tree(data_dir, img_sub_folder=None, fovs=None, channels=None,
                        dtype="int16", variable_sizes=False):
    """Takes a set of imgs from a directory structure and loads them into an xarray.

    Args:
        data_dir (str): directory containing folders of images
        img_sub_folder (str): optional name of image sub-folder within each fov
        fovs (list): optional list of folders to load imgs from. Default loads all folders
        channels (list): optional list of imgs to load, otherwise loads all imgs
        dtype (str/type): dtype of array which will be used to store values
        variable_sizes (bool): if true, will pad loaded images with zeros to fit into array

    Returns:
        img_xr (xr.DataArray): xarray with shape [fovs, x_dim, y_dim, tifs]
    """

    if fovs is None:
        # get all fovs
        fovs = iou.list_folders(data_dir)
        fovs.sort()

    if len(fovs) == 0:
        raise ValueError(f"No fovs found in directory, {data_dir}")

    if img_sub_folder is None:
        # no img_sub_folder, change to empty string to read directly from base folder
        img_sub_folder = ""

    # get imgs from first fov if no img names supplied
    if channels is None:
        channels = iou.list_files(os.path.join(data_dir, fovs[0], img_sub_folder),
                                  substrs=['.tif', '.jpg', '.png'])

        # if taking all channels from directory, sort them alphabetically
        channels.sort()
    # otherwise, fill channel names with correct file extension
    elif not all([img.endswith(("tif", "tiff", "jpg", "png")) for img in channels]):
        channels = iou.list_files(os.path.join(data_dir, fovs[0], img_sub_folder),
                                  substrs=channels)

    if len(channels) == 0:
        raise ValueError("No images found in designated folder")

    test_img = io.imread(os.path.join(data_dir, fovs[0], img_sub_folder, channels[0]))

    # check to make sure that float dtype was supplied if image data is float
    data_dtype = test_img.dtype
    if np.issubdtype(data_dtype, np.floating):
        if not np.issubdtype(dtype, np.floating):
            warnings.warn(
                f"The supplied non-float dtype {dtype} was overwritten to {data_dtype}, "
                f"because the loaded images are floats")
            dtype = data_dtype

    if variable_sizes:
        img_data = np.zeros((len(fovs), 1024, 1024, len(channels)), dtype=dtype)
    else:
        img_data = np.zeros((len(fovs), test_img.shape[0], test_img.shape[1], len(channels)),
                            dtype=dtype)

    for fov in range(len(fovs)):
        for img in range(len(channels)):
            if variable_sizes:
                temp_img = io.imread(
                    os.path.join(data_dir, fovs[fov], img_sub_folder, channels[img]))
                img_data[fov, :temp_img.shape[0], :temp_img.shape[1], img] = temp_img
            else:
                img_data[fov, :, :, img] = io.imread(
                    os.path.join(data_dir, fovs[fov], img_sub_folder, channels[img]))

    # check to make sure that dtype wasn't too small for range of data
    if np.min(img_data) < 0:
        raise ValueError("Integer overflow from loading TIF image, try a larger dtype")

    if variable_sizes:
        row_coords, col_coords = range(1024), range(1024)
    else:
        row_coords, col_coords = range(test_img.shape[0]), range(test_img.shape[1])

    # remove .tif or .tiff from image name
    img_names = [os.path.splitext(img)[0] for img in channels]

    img_xr = xr.DataArray(img_data,
                          coords=[fovs, row_coords, col_coords, img_names],
                          dims=["fovs", "rows", "cols", "channels"])

    return img_xr
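# Usage sketch (illustrative only): assumes a cohort laid out as
# <data_dir>/<fov>/<img_sub_folder>/<channel>.tif; all names below are
# hypothetical, not values defined in this repo.
#
#   img_xr = load_imgs_from_tree('/path/to/cohort',              # hypothetical dir
#                                img_sub_folder='TIFs',          # hypothetical sub-folder
#                                fovs=['Point1', 'Point2'],      # hypothetical fov folders
#                                channels=['CD45.tif', 'CD8.tif'])
#   print(img_xr.shape)  # (2, rows, cols, 2)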
def load_imgs_from_mibitiff(data_dir, mibitiff_files=None, channels=None,
                            delimiter=None, dtype='int16'):
    """Load images from a series of MIBItiff files.

    This function takes a set of MIBItiff files and loads the images into an xarray.
    The type used to store the images will be the same as that of the MIBI images
    stored in the MIBItiff files.

    Args:
        data_dir (str): directory containing MIBItiffs
        mibitiff_files (list): list of MIBItiff files to load. If None,
            all MIBItiff files in data_dir are loaded.
        channels (list): optional list of channels to load. Defaults to `None`, in
            which case, all channels in the first MIBItiff are used.
        delimiter (str): optional delimiter-character/string which separates fov names
            from the rest of the file name. Defaults to None
        dtype (str/type): optional specifier of image type. Overwritten with warning for
            float images

    Returns:
        img_xr (xr.DataArray): xarray with shape [fovs, x_dim, y_dim, channels]
    """

    if not mibitiff_files:
        mibitiff_files = iou.list_files(data_dir, substrs=['.tif'])

    # extract fov names w/ delimiter agnosticism
    fovs = iou.extract_delimited_names(mibitiff_files, delimiter=delimiter)

    mibitiff_files = [
        os.path.join(data_dir, mt_file) for mt_file in mibitiff_files
    ]

    test_img = io.imread(mibitiff_files[0], plugin='tifffile')

    # check to make sure that float dtype was supplied if image data is float
    data_dtype = test_img.dtype
    if np.issubdtype(data_dtype, np.floating):
        if not np.issubdtype(dtype, np.floating):
            warnings.warn(
                f"The supplied non-float dtype {dtype} was overwritten to {data_dtype}, "
                f"because the loaded images are floats")
            dtype = data_dtype

    # if no channels specified, get them from first MIBItiff file
    if channels is None:
        channel_tuples = tiff.read(mibitiff_files[0]).channels
        channels = [channel_tuple[1] for channel_tuple in channel_tuples]

    # extract images from MIBItiff file
    img_data = []
    for mibitiff_file in mibitiff_files:
        img_data.append(tiff.read(mibitiff_file)[channels])
    img_data = np.stack(img_data, axis=0)
    img_data = img_data.astype(dtype)

    # create xarray with image data
    img_xr = xr.DataArray(img_data,
                          coords=[fovs,
                                  range(img_data[0].data.shape[0]),
                                  range(img_data[0].data.shape[1]),
                                  channels],
                          dims=["fovs", "rows", "cols", "channels"])

    return img_xr
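# Usage sketch (illustrative only): the directory, channel names, and delimiter
# below are assumptions, not values defined in this repo.
#
#   img_xr = load_imgs_from_mibitiff('/path/to/mibitiffs',       # hypothetical dir
#                                    channels=['CD45', 'CD8'],   # hypothetical channel names
#                                    delimiter='_')
#   print(img_xr.fovs.values)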
def create_deepcell_output(deepcell_input_dir, deepcell_output_dir, fovs=None,
                           suffix='_feature_0', host='https://deepcell.org',
                           job_type='multiplex'):
    """Handles all of the necessary data manipulation for running deepcell tasks.

    Creates .zip files (to be used as input for DeepCell), calls the
    run_deepcell_task method, and extracts the zipped output files to the
    specified output location.

    Args:
        deepcell_input_dir (str): Location of preprocessed files
            (assume deepcell_input_dir contains <fov>.tif for each fov in fovs list)
        deepcell_output_dir (str): Location to save DeepCell output (as .tif)
        fovs (list): List of fovs in preprocessing pipeline. If None, all .tif files
            in deepcell_input_dir will be considered as input fovs.
            Default: None
        suffix (str): Suffix for DeepCell output filename. e.g. for fovX, DeepCell output
            should be <fovX>+suffix.tif.
            Default: '_feature_0'
        host (str): Hostname and port for the kiosk-frontend API server.
            Default: 'https://deepcell.org'
        job_type (str): Name of job workflow (multiplex, segmentation, tracking).
            Default: 'multiplex'

    Raises:
        ValueError: Raised if there is some fov X (from fovs list) s.t.
            the file <deepcell_input_dir>/fovX.tif does not exist
    """

    if fovs is None:
        tifs = io_utils.list_files(deepcell_input_dir, substrs='.tif')
        fovs = io_utils.extract_delimited_names(tifs, delimiter='.')

    zip_path = os.path.join(deepcell_input_dir, 'fovs.zip')
    if os.path.isfile(zip_path):
        warnings.warn(f'{zip_path} will be overwritten.')

    with ZipFile(zip_path, 'w') as zipObj:
        for fov in fovs:
            filename = os.path.join(deepcell_input_dir, fov + '.tif')
            if not os.path.isfile(filename):
                raise ValueError('Could not find .tif file for %s. '
                                 'Invalid value for %s' % (fov, filename))
            zipObj.write(filename, os.path.basename(filename))

    run_deepcell_task(zip_path, deepcell_output_dir, host, job_type)
    os.remove(zip_path)

    # extract the .tif output
    zip_files = glob.glob(os.path.join(deepcell_output_dir, '*.zip'))
    zip_files.sort(key=os.path.getmtime)

    with ZipFile(zip_files[-1], 'r') as zipObj:
        zipObj.extractall(deepcell_output_dir)
        for fov in fovs:
            if fov + suffix + '.tif' not in zipObj.namelist():
                warnings.warn(f'Deep Cell output file was not found for {fov}.')
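# Usage sketch (illustrative only): both directories and the fov names are
# assumptions, not values defined in this repo; the call requires network access
# to the DeepCell API host.
#
#   create_deepcell_output(deepcell_input_dir='/path/to/deepcell_input',     # hypothetical dir
#                          deepcell_output_dir='/path/to/deepcell_output',   # hypothetical dir
#                          fovs=['Point1', 'Point2'])                        # hypothetical fovs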
jacc_mean = np.mean(jacc)

metrics[seed] = {'tissue_stats': tissue_stats,
                 'platform_stats': platform_stats,
                 'jacc': jacc_mean}

np.savez_compressed(os.path.join(benchmarking_dir, 'ilastik_metrics_jacc.npz'), **metrics)

# Retrained cellpose

# copy predictions
cellpose_dir = os.path.join(base_dir, 'benchmarking_accuracy/cellpose_predictions')
download_dir = os.path.join(base_dir, 'benchmarking_accuracy/cellpose_predictions_full')

for seed in ['1', '2', '3']:
    save_dir = os.path.join(cellpose_dir, 'split{}'.format(seed))
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    masks = list_files(download_dir + '/split{}'.format(seed), 'masks')

    for mask in masks:
        shutil.move(os.path.join(download_dir, 'split{}'.format(seed), mask),
                    os.path.join(save_dir, mask))

# load predictions into single npz
for seed in ['1', '2', '3']:
    output_shape = np.load(
        npz_dir + '/20201018_multiplex_seed_{}_test_256x256.npz'.format(seed))['y'].shape
    output = np.zeros(output_shape)
    for i in range(output.shape[0]):
        zero_filled = str(i).zfill(4)
        img = io.imread(os.path.join(cellpose_dir,
                                     'split{}/{}_img_cp_masks.tif'.format(seed, zero_filled)))
        output[i, :, :, 0] = img

    np.savez_compressed(os.path.join(cellpose_dir, 'split_{}_combined_masks.npz'.format(seed)),
                        y=output)
# create scalebar that goes from 0 to 1
tissue_array[0, 0] = 1

g = sns.heatmap(data=platform_array, annot=True, vmin=0, cmap='Blues')
plt.savefig(os.path.join(plot_dir, 'Figure_S3b_scalebar.pdf'))

# Figure 3d
data_dir = base_dir + 'Human_agreement/'
folders = ['DCIS_2328', 'Eliot_Point17', 'P101_T3_T4_Point2', 'cHL_Point8908']
folder_names = ['DCIS_MIBI', 'Colon_IF', 'Esophagus_MIBI', 'Hodgekins_Vectra']

f1_list, tissue_list, annotator_list = [], [], []
for i in range(len(folders)):
    # get all of the human annotations
    folder_path = os.path.join(data_dir, folders[i], 'annotations')
    img_names = io_utils.list_files(folder_path, '.tiff')

    imgs = []
    for img in img_names:
        current_img = io.imread(os.path.join(folder_path, img))
        imgs.append(current_img)

    f1_scores_human = figures.calculate_human_f1_scores(image_list=imgs)
    tissue_name = folder_names[i]
    f1_list.extend(f1_scores_human)
    tissue_list.extend([tissue_name] * len(f1_scores_human))
    annotator_list.extend(['human'] * len(f1_scores_human))

    # compare algorithm
    pred_img = io.imread(os.path.join(data_dir, folders[i], 'segmentation_label.tiff'))
    pred_img = np.expand_dims(pred_img, axis=0)
    f1_scores_alg = figures.calculate_alg_f1_scores(image_list=imgs,