def get_panorama_images(
    mcd_file: Path,
    output_file_prefix: Optional[Path] = None,
    overwrite: bool = False,
) -> Optional[List[Array]]:
    """Extract the panorama images stored in an MCD file.

    Each entry of the session's ``"Panorama"`` metadata provides a start and
    end offset; the bytes between them (shifted by a fixed offset) are read
    from the file. Empty buffers are skipped.

    Parameters
    ----------
    mcd_file
        Path to the MCD file, passed to ``McdParser``.
    output_file_prefix
        If given, every non-empty image buffer is written verbatim to
        ``<output_file_prefix>_<panorama ID>.png`` and nothing is returned.
        If ``None``, the buffers are decoded with ``imageio.imread`` and
        returned instead.
    overwrite
        Whether existing output files are rewritten. Only relevant when
        ``output_file_prefix`` is given.

    Returns
    -------
    list or None
        The decoded images when ``output_file_prefix`` is ``None`` (buffers
        that ``imageio`` rejects with ``ValueError`` are skipped), otherwise
        ``None``.
    """
    import imageio

    # NOTE(review): both offsets from the metadata are shifted by a fixed 161
    # bytes before reading - presumably to skip a header that precedes each
    # stored image; confirm against the MCD format.
    byteoffset = 161
    imgs = []

    mcd = McdParser(mcd_file)
    try:
        for slide in mcd.session.metadata["Panorama"]:
            start = int(slide["ImageStartOffset"])
            end = int(slide["ImageEndOffset"])
            img = mcd._get_buffer(start + byteoffset, end + byteoffset)
            if len(img) == 0:  # empty image
                continue

            if output_file_prefix is not None:
                # Relies on the project's Path type supporting `+` with str.
                output_file = output_file_prefix + f"_{slide['ID']}.png"
                if overwrite or (not output_file.exists()):
                    with open(output_file, "wb") as f:
                        f.write(img)
            else:
                try:
                    imgs.append(imageio.imread(img))
                except ValueError:
                    # Buffer could not be decoded as an image: skip it.
                    continue
    finally:
        # Release the parser even if reading, writing or decoding failed.
        mcd.close()

    if output_file_prefix is None:
        return imgs
    return None
class McdFileReader(FileReaderBase):
    """File reader exposing the panoramas and acquisitions of an ``.mcd`` file.

    The underlying ``McdParser`` is created in ``__enter__`` and closed in
    ``__exit__``; the listing and reading methods use that parser, so they
    are meant to be called inside a ``with`` block.
    """

    def __init__(self, path: Union[str, Path]):
        super().__init__(path)
        # Populated by __enter__.
        self._mcd_parser: Optional[McdParser] = None

    def _get_imc_file_panoramas(self, imc_file: IMCFileModel) -> List[IMCFilePanoramaModel]:
        """Build one panorama model per session panorama, skipping those of type 'Default'."""
        panorama_models = []
        for pano in self._mcd_parser.session.panoramas.values():
            if pano.image_type == 'Default':
                continue
            panorama_models.append(
                IMCFilePanoramaModel(imc_file, pano.id, pano.image_type, pano.description)
            )
        return panorama_models

    def _get_imc_file_acquisitions(self, imc_file: IMCFileModel) -> List[IMCFileAcquisitionModel]:
        """Build one acquisition model per session acquisition flagged as valid."""
        acquisition_models = []
        for acq in self._mcd_parser.session.acquisitions.values():
            if not acq.is_valid:
                continue
            acquisition_models.append(
                IMCFileAcquisitionModel(imc_file, acq.id, acq.description, acq.channel_labels)
            )
        return acquisition_models

    def read_panorama(self, panorama_id: int) -> Tuple[ImageDimensions, np.ndarray]:
        """Read a panorama image together with its bounding box.

        Returns ``((x, y, width, height), data)`` where ``x``/``y`` are the
        smallest corner coordinates and ``width``/``height`` the extent of
        the four corners. The image is mirrored along an axis whenever the
        first corner (``x1``/``y1``) is not the minimum on that axis.
        """
        pano = self._mcd_parser.session.panoramas[panorama_id]
        corner_xs = (pano.x1, pano.x2, pano.x3, pano.x4)
        corner_ys = (pano.y1, pano.y2, pano.y3, pano.y4)
        x_min = min(corner_xs)
        y_min = min(corner_ys)
        width = max(corner_xs) - x_min
        height = max(corner_ys) - y_min

        image = imread(self._mcd_parser.get_panorama_image(panorama_id))
        if pano.x1 != x_min:
            # First corner is not the leftmost one: mirror the columns.
            image = image[:, ::-1, :]
        if pano.y1 != y_min:
            # First corner is not the lowest-y one: mirror the rows.
            image = image[::-1, :, :]

        dimensions = (x_min, y_min, width, height)
        return dimensions, image

    def read_acquisition(self, acquisition_id: int, channel_label: str) -> Tuple[ImageDimensions, np.ndarray]:
        """Read one channel of an acquisition together with its bounding box.

        Returns ``((x, y, width, height), data)`` derived from the ROI start
        and end positions. The image is mirrored along an axis whenever the
        ROI start position is not the minimum on that axis.
        """
        acq = self._mcd_parser.session.acquisitions[acquisition_id]
        x_start, x_end = acq.roi_start_x_pos_um, acq.roi_end_x_pos_um
        y_start, y_end = acq.roi_start_y_pos_um, acq.roi_end_y_pos_um
        x_min = min(x_start, x_end)
        y_min = min(y_start, y_end)
        width = max(x_start, x_end) - x_min
        height = max(y_start, y_end) - y_min

        acquisition_data = self._mcd_parser.get_acquisition_data(acq.id)
        image = acquisition_data.get_image_by_label(channel_label)
        if acq.roi_start_x_pos_um != x_min:
            image = image[:, ::-1]
        if acq.roi_start_y_pos_um != y_min:
            image = image[::-1, :]

        dimensions = (x_min, y_min, width, height)
        return dimensions, image

    def __enter__(self) -> 'FileReaderBase':
        """Open the MCD file and return this reader."""
        # `_path` is not assigned in this class; presumably set by the base
        # class constructor.
        self._mcd_parser = McdParser(self._path)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the parser opened by ``__enter__``."""
        self._mcd_parser.close()

    @classmethod
    def accepts(cls, path: Union[str, Path]) -> bool:
        """Return whether ``path`` has a ``.mcd`` suffix (case-insensitive)."""
        suffix = Path(path).suffix
        return suffix.lower() == '.mcd'
def inspect_mcd(mcd_file: Path, args: Args) -> Tuple[dict, DataFrame]:
    """Collect session metadata and a channel annotation table from an MCD file.

    Parameters
    ----------
    mcd_file
        Path to the MCD file. Output file names are derived from it by
        replacing ``".mcd"`` (via the path's ``replace_`` method).
    args
        Options object; only ``args.no_write`` is read here. When it is
        false, the channel table is written to ``*.channel_labels.csv``
        (only if all ROIs share the same channels) and the metadata to
        ``*.session_metadata.yaml``.

    Returns
    -------
    meta
        Session metadata mapping, extended with slide/panorama/ROI counts,
        per-panorama and per-acquisition dictionaries and the consensus
        channels (``None`` if ROIs differ).
    annot
        Channel annotation table with columns ``Target``, ``Metal_Tag``,
        ``Atom``, ``full`` and ``ilastik``, indexed by channel name. It is
        empty when the ROIs do not all share the same channels.
    """
    cols = [
        "Target",
        "Metal_Tag",
        "Atom",
        "full",
        "ilastik",
    ]
    exclude_channels = ["EMPTY", "190BCKG", "80Ar", "89Y", "127I", "124Xe"]

    print(f"Started analyzing '{mcd_file}'!")

    mcd = McdParser(mcd_file)
    try:
        session = mcd.session

        # get channel labels
        ac_ids = session.acquisition_ids
        labels = pd.DataFrame({
            ac_id: cleanup_channel_names(
                session.acquisitions[ac_id].channel_labels)
            for ac_id in ac_ids
        })
        metals = pd.DataFrame(
            {ac_id: session.acquisitions[ac_id].channel_names
             for ac_id in ac_ids})
        channel_names = labels.replace({None: "<EMPTY>"}) + "(" + metals + ")"

        # One row per channel position, one column per ROI. ROIs agree when
        # every row holds a single distinct name; rows where nunique() is 0
        # (only missing values) are treated as agreeing. The comparison
        # against 1 is essential: a bare `.all()` on the counts is true for
        # any non-zero count and therefore could never detect a mismatch.
        distinct_per_channel = channel_names.nunique(axis=1).replace(0, 1)
        same_channels = bool(
            (distinct_per_channel == 1).all())  # np.bool is not serializable

        if same_channels:
            print("\t * All ROIs have the same markers/metals.")
            ch = channel_names.iloc[:, 0].rename("channel")
            ids = ch.str.extract(r"(?P<Target>.*)\((?P<Metal_Tag>.*)\)")
            ids.index = ch

            annot = pd.DataFrame(ids, columns=cols)
            annot["Atom"] = annot["Metal_Tag"].str.extract(r"(\d+)")[0]
            # Flag channels for the "full" stack unless their name matches
            # one of the excluded patterns.
            annot["full"] = (~annot.index.str.contains(
                "|".join(exclude_channels))).astype(int)
            # Flag DNA channels and channels whose name starts with "CD".
            annot["ilastik"] = (
                annot.index.str.contains("DNA")
                | annot.index.str.startswith("CD")).astype(int)
            if not args.no_write:
                annot.to_csv(mcd_file.replace_(".mcd", ".channel_labels.csv"))
        else:
            annot = pd.DataFrame(columns=cols)
            print("\t * ROIs have different markers/metals.")

        # Save some metadata
        meta = session.get_csv_dict()
        meta["n_slides"] = len(session.slides)
        print(f"\t * Contains {meta['n_slides']} slides.")
        meta["n_panoramas"] = len(session.panoramas)
        print(f"\t * Contains {meta['n_panoramas']} panoramas.")
        meta["n_ROIs"] = len(session.acquisition_ids)
        print(f"\t * Contains {meta['n_ROIs']} ROIs.")
        meta["ROI_numbers"] = session.acquisition_ids
        meta["all_ROIs_same_channels"] = same_channels
        meta["consensus_channels"] = (
            channel_names.iloc[:, 0].to_dict() if same_channels else None)
        meta["panoramas"] = {
            p: v.get_csv_dict() for p, v in session.panoramas.items()
        }
        meta["acquisitions"] = {
            a: ac.get_csv_dict() for a, ac in session.acquisitions.items()
        }
        meta.update(session.metadata)

        if not args.no_write:
            yaml_file = mcd_file.replace_(".mcd", ".session_metadata.yaml")
            # Context manager so the handle is flushed and closed promptly.
            with open(yaml_file, "w") as handle:
                yaml.dump(
                    encode(meta),
                    handle,
                    indent=4,
                    default_flow_style=False,
                    sort_keys=False,
                )
    finally:
        # Release the parser even if inspection or writing failed.
        mcd.close()

    print(f"Finished with '{mcd_file}'!")
    return meta, annot
def mcdfolder_to_imcfolder(input: Union[str, Path], output_folder: Union[str, Path], create_zip: bool = False, parse_txt: bool = False):
    """Converts folder (or zipped folder) containing raw acquisition data (mcd and txt files) to IMC folder containing standardized files.

    Parameters
    ----------
    input
        Input folder / .zip file with raw .mcd/.txt acquisition data files.
    output_folder
        Path to the output folder.
    create_zip
        Whether to create an output as .zip file.
    parse_txt
        Always use TXT files if present to get acquisition image data.

    Raises
    ------
    AssertionError
        If the input does not contain exactly one (non-hidden) MCD file.
    """
    if isinstance(input, str):
        input = Path(input)

    tmpdir = None
    mcd_parser = None
    try:
        # Extraction happens inside the try block so the temporary directory
        # is cleaned up even when the archive cannot be unpacked.
        if input.is_file() and input.suffix == ZIP_FILENDING:
            tmpdir = TemporaryDirectory()
            with zipfile.ZipFile(input, allowZip64=True) as archive:
                archive.extractall(tmpdir.name)
            input_folder = Path(tmpdir.name)
        else:
            input_folder = input

        # Hidden files (names starting with ".") are ignored.
        mcd_files = [
            f for f in input_folder.rglob(f"*{MCD_FILENDING}")
            if not f.name.startswith(".")
        ]
        if len(mcd_files) != 1:
            # Raised explicitly (rather than via `assert`) so the check also
            # runs under `python -O`; the exception type is kept for callers.
            raise AssertionError(
                f"Expected exactly one {MCD_FILENDING} file in "
                f"'{input_folder}', found {len(mcd_files)}")
        mcd_file = mcd_files[0]

        # Schema and TXT files are looked up next to the MCD file.
        input_folder = mcd_file.parent
        schema_files = glob.glob(str(input_folder / f"*{SCHEMA_FILENDING}"))
        schema_file = schema_files[0] if len(schema_files) > 0 else None

        try:
            mcd_parser = McdParser(mcd_file)
        except Exception:
            # Without a schema file there is nothing to fall back on.
            if schema_file is None:
                raise
            logging.error(
                "MCD file is corrupted, trying to rescue with schema file")
            mcd_parser = McdParser(mcd_file,
                                   xml_metadata_filepath=schema_file)

        txt_files = glob.glob(
            str(input_folder / f"*[0-9]{TXT_FILE_EXTENSION}"))
        txt_acquisitions_map = {
            TxtParser.extract_acquisition_id(f): f for f in txt_files
        }

        imc_writer = ImcWriter(output_folder, mcd_parser,
                               txt_acquisitions_map, parse_txt)
        imc_writer.write_imc_folder(create_zip=create_zip)
    finally:
        if mcd_parser is not None:
            mcd_parser.close()
        if tmpdir is not None:
            tmpdir.cleanup()
def mcd_to_dir(
    mcd_file: Path,
    pannel_csv: Path = None,
    ilastik_output: bool = True,
    ilastik_channels: List[str] = None,
    output_dir: Path = None,
    output_format: str = "tiff",
    overwrite: bool = False,
    sample_name: str = None,
    partition_panels: bool = False,
    filter_full: bool = True,
    export_panoramas: bool = True,
    keep_original_roi_names: bool = False,
    allow_empty_rois: bool = True,
    only_crops: bool = False,
    n_crops: int = 5,
    crop_width: int = 500,
    crop_height: int = 500,
) -> None:
    """Export the acquisitions of an MCD file as image stacks and ilastik input.

    For every acquisition the hot-pixel-filtered stack is saved under
    ``<output_dir>/tiffs`` together with a CSV of its channel labels. If
    ``ilastik_output`` is set, a 2x upscaled stack of the ilastik channels
    and ``n_crops`` random crops of it are written as HDF5 files.

    Parameters
    ----------
    mcd_file
        Path to the MCD file.
    pannel_csv
        CSV whose first column is the index and which has an ``ilastik``
        column; rows with ``ilastik == 1`` define ``ilastik_channels`` when
        those are not given.
    ilastik_output
        Whether to write the ilastik stack and crops.
    ilastik_channels
        Channel names to export for ilastik.
    output_dir
        Output directory. Defaults to ``imc_dir`` next to ``mcd_file``.
    output_format
        ``"tiff"`` or ``"ome-tiff"``. Any other value saves no full stack.
    overwrite
        Whether existing outputs are rewritten.
    sample_name
        Name used in output file names. Defaults to the session name.
    partition_panels
        Not implemented; raises if true.
    filter_full
        Drop channel labels starting with a digit or containing
        ``"<EMPTY>"`` from the exported label table and stack names.
    export_panoramas
        Whether to export panorama images with prefix
        ``<output_dir>/Panorama``.
    keep_original_roi_names
        Build the output prefix from the session name instead of
        ``sample_name`` plus a running number.
    allow_empty_rois
        If true, acquisitions whose data cannot be read are reported and
        skipped; otherwise the error is re-raised.
    only_crops
        Skip saving the full stack and its channel-label CSV.
    n_crops
        Number of random crops per acquisition.
    crop_width, crop_height
        Crop size in pixels of the 2x upscaled image (x and y extent).

    Raises
    ------
    NotImplementedError
        If ``partition_panels`` is true.
    ValueError
        If neither ``pannel_csv`` nor ``ilastik_channels`` is given.
    """

    # NOTE(review): the next three helpers are not called anywhere in this
    # function; they look like groundwork for `partition_panels` and are
    # kept unchanged.
    def get_dataframe_from_channels(mcd):
        return pd.DataFrame(
            [mcd.get_acquisition_channels(x) for x in session.acquisition_ids],
            index=session.acquisition_ids,
        )

    def all_channels_equal(mcd):
        chs = get_dataframe_from_channels(mcd)
        return all([(chs[c].value_counts() == mcd.n_acquisitions).all()
                    for c in chs.columns])

    def get_panel_partitions(mcd):
        chs = get_dataframe_from_channels(mcd)

        partitions = {k: set(k) for k in chs.drop_duplicates().index}
        for p in partitions:
            for _, row in chs.iterrows():
                print(p, row.name)
                if (row == chs.loc[list(partitions[p])[0]]).all():
                    partitions[p] = partitions[p].union(set([row.name]))
        return partitions.values()

    def clip_hot_pixels(img, hp_filter_shape=(3, 3), hp_threshold=0.0001):
        """Clip pixels exceeding the maximum of their neighbours by more than the threshold."""
        if hp_filter_shape[0] % 2 != 1 or hp_filter_shape[1] % 2 != 1:
            raise ValueError("Invalid hot pixel filter shape: %s" %
                             str(hp_filter_shape))
        # Footprint covers the neighbourhood but not the centre pixel itself.
        hp_filter_footprint = np.ones(hp_filter_shape)
        hp_filter_footprint[int(hp_filter_shape[0] / 2),
                            int(hp_filter_shape[1] / 2)] = 0
        max_img = ndi.maximum_filter(img,
                                     footprint=hp_filter_footprint,
                                     mode="reflect")
        hp_mask = img - max_img > hp_threshold
        img = img.copy()
        img[hp_mask] = max_img[hp_mask]
        return img

    if partition_panels:
        raise NotImplementedError(
            "Partitioning sample per panel is not implemented yet.")

    if pannel_csv is None and ilastik_channels is None:
        raise ValueError(
            "One of `pannel_csv` or `ilastik_channels` must be given!")
    if ilastik_channels is None and pannel_csv is not None:
        panel = pd.read_csv(pannel_csv, index_col=0)
        ilastik_channels = panel.query("ilastik == 1").index.tolist()

    # Axis description attached to every HDF5 dataset: (y, x, channel).
    H5_YXC_AXISTAG = json.dumps({
        "axes": [
            {
                "key": "y",
                "typeFlags": 2,
                "resolution": 0,
                "description": "",
            },
            {
                "key": "x",
                "typeFlags": 2,
                "resolution": 0,
                "description": "",
            },
            {
                "key": "c",
                "typeFlags": 1,
                "resolution": 0,
                "description": "",
            },
        ]
    })

    if output_dir is None:
        output_dir = mcd_file.parent / "imc_dir"
    output_dir.mkdir(exist_ok=True, parents=True)
    dirs = ["tiffs"] + (["ilastik"] if ilastik_output else [])
    for _dir in dirs:
        (output_dir / _dir).mkdir(exist_ok=True)

    # Export panoramas
    if export_panoramas:
        get_panorama_images(
            mcd_file,
            output_file_prefix=output_dir / "Panorama",
            overwrite=overwrite,
        )

    # Parse MCD
    mcd = McdParser(mcd_file)
    try:
        session = mcd.session

        if sample_name is None:
            sample_name = session.name

        for i, ac_id in enumerate(session.acquisition_ids):
            print(ac_id, end="\t")
            try:
                ac = mcd.get_acquisition_data(ac_id)
            except Exception as e:  # imctools.io.abstractparserbase.AcquisitionError
                if allow_empty_rois:
                    print(e)
                    continue
                raise

            # Get output prefix
            # NOTE(review): with `keep_original_roi_names` the prefix does
            # not contain anything acquisition-specific, so all acquisitions
            # share one prefix - confirm this is intended.
            if keep_original_roi_names:
                prefix = (output_dir / "tiffs" /
                          (session.name.replace(" ", "_") + "_ac"))
            else:
                prefix = (output_dir / "tiffs" /
                          (sample_name + "-" + str(i + 1).zfill(2)))

            # Skip if not overwrite
            file_ending = ("ome.tiff"
                           if output_format == "ome-tiff" else "tiff")
            if (prefix + "_full." + file_ending).exists() and not overwrite:
                print(
                    "TIFF images exist and overwrite is set to `False`. Continuing."
                )
                continue

            # Filter channels
            channel_labels = build_channel_name(ac.channel_labels,
                                                ac.channel_names)
            to_exp = channel_labels[channel_labels.isin(ilastik_channels)]
            to_exp_ind = [
                ac.channel_masses.index(y)
                for y in to_exp.str.extract(r".*\(..(\d+)\)")[0]
            ]
            assert to_exp_ind == to_exp.index.tolist()

            if filter_full:
                # remove background and empty channels
                # TODO: find way to do this more systematically
                channel_labels = channel_labels[~(
                    channel_labels.str.contains(r"^\d")
                    | channel_labels.str.contains("<EMPTY>")
                )].reset_index(drop=True)

            # Filter hot pixels
            ac._image_data = np.asarray(
                [clip_hot_pixels(x) for x in ac.image_data])

            # Save full image
            if not only_crops:
                p = prefix + "_full."
                names = channel_labels.str.extract(r"\((.*)\)")[0]
                if output_format == "tiff":
                    if overwrite or not (p + file_ending).exists():
                        ac.save_tiff(p + file_ending, names=names)
                elif output_format == "ome-tiff":
                    if overwrite or not (p + file_ending).exists():
                        ac.save_ome_tiff(
                            p + file_ending,
                            names=names,
                            xml_metadata=mcd.get_mcd_xml(),
                        )
                # Save channel labels for the stack
                if overwrite or not (p + "csv").exists():
                    channel_labels.to_csv(p + "csv")

            if not ilastik_output:
                continue

            # Make input for ilastik training
            # # zoom 2x
            s = tuple(x * 2 for x in ac.image_data.shape[1:])
            full = np.moveaxis(
                np.asarray([resize(x, s) for x in ac.image_data[to_exp_ind]]),
                0, -1)

            # # Save input for ilastik prediction
            with h5py.File(prefix + "_ilastik_s2.h5", mode="w") as handle:
                d = handle.create_dataset("stacked_channels", data=full)
                d.attrs["axistags"] = H5_YXC_AXISTAG

            # # random crops
            # NOTE(review): this prefix carries no acquisition identifier, so
            # crops of different acquisitions differ only by their offsets.
            iprefix = (output_dir / "ilastik" /
                       (sample_name.replace(" ", "_") + "_ac"))
            # # # make sure height/width are smaller or equal to acquisition dimensions
            # `full` is laid out (y, x, channel), matching the axistags.
            full_height, full_width = full.shape[0], full.shape[1]
            if (full_width < crop_width) or (full_height < crop_height):
                msg = "Image is smaller than the requested crop size for ilastik training."
                print(msg)
                continue
            for _ in range(n_crops):
                # randint's upper bound is exclusive; the +1 allows a crop
                # flush with the far edge and an image of exactly crop size.
                x = np.random.randint(full_width - crop_width + 1)
                y = np.random.randint(full_height - crop_height + 1)
                # Rows are indexed by y/height, columns by x/width.
                crop = full[y:(y + crop_height), x:(x + crop_width), :]
                assert crop.shape == (crop_height, crop_width, len(to_exp))
                with h5py.File(
                        iprefix +
                        f"_ilastik_x{x}_y{y}_w{crop_width}_h{crop_height}.h5",
                        mode="w",
                ) as handle:
                    d = handle.create_dataset("stacked_channels", data=crop)
                    d.attrs["axistags"] = H5_YXC_AXISTAG

            print("")  # add a newline to the tabs
    finally:
        # Release the parser even if an export step failed.
        mcd.close()