def imzml_to_sbd(filepath_imzml, filepath_sbd):
    """Convert an .imzML/.ibd pair into a single .sbd file.

    The output consists of, in order (all little-endian):

    * a 10-byte header packed as ``<BQB``: ``0``, the number of spectra, ``8``;
    * one 20-byte record per spectrum packed as ``<QLfHH``: byte offset of
      the spectrum data, number of points, sum of intensities, x, y;
    * the spectra themselves, each emitted by ``write_spectrum``.

    Args:
        filepath_imzml: Path of the .imzML file to read.
        filepath_sbd: Path of the .sbd file to create (overwritten).

    Returns:
        bool: True on success.
    """
    # Parse the input *before* opening the output so that an unreadable
    # imzML file does not leave an empty or truncated .sbd file behind.
    p = ImzMLParser(filepath_imzml)
    n_spectra = len(p.coordinates)

    # First pass: build the per-spectrum table. Spectrum data starts right
    # after the header (10 bytes) and the table (20 bytes per spectrum).
    meta = []
    offset = 20 * n_spectra + 10
    for idx, (x, y, _z) in enumerate(p.coordinates):
        mzs, intensities = p.getspectrum(idx)
        n_points = len(mzs)
        meta.append((offset, n_points, np.sum(intensities), x, y))
        # 12 bytes are reserved per point; presumably this matches what
        # write_spectrum emits -- TODO confirm against write_spectrum.
        offset += n_points * 12

    with open(filepath_sbd, 'wb') as out_file:
        out_file.write(struct.pack('<BQB', 0, n_spectra, 8))
        for meta_item in meta:
            out_file.write(struct.pack('<QLfHH', *meta_item))

        # Second pass: spectra are re-read so they never all sit in memory.
        for i in range(n_spectra):
            mzs, intensities = p.getspectrum(i)
            write_spectrum(out_file, (mzs, intensities))
    return True
class IMSDataset:
    """Wrapper around an imzML file offering tabular and array exports.

    Attributes:
        parser: The ``ImzMLParser`` opened on the given file.
        micro_res: Value stored from the ``micro_res`` argument.
        IMS_res: Value stored from the ``IMS_res`` argument.
        IMS_px_in_micro: ``IMS_res / micro_res``.
    """

    def __init__(self, fpath, micro_res=0.5, IMS_res=10):
        self.parser = ImzMLParser(fpath)
        self.micro_res = micro_res
        self.IMS_res = IMS_res
        self.IMS_px_in_micro = IMS_res / micro_res

    def __get_min_max_coords(self):
        """Return ``(x_min, y_min, x_max, y_max)`` over all pixel coordinates."""
        coords = np.array(self.parser.coordinates)
        x_min, y_min, _ = np.min(coords, axis=0)
        x_max, y_max, _ = np.max(coords, axis=0)
        return x_min, y_min, x_max, y_max

    def to_columnar(self, mz_precision=4, dtype="uint32"):
        """Return one DataFrame row per pixel: coordinates plus intensities.

        The m/z values of the first spectrum (rounded to ``mz_precision``
        decimals and converted to strings) label the intensity columns, so
        every spectrum must have the same length as the first one.

        Args:
            mz_precision: Number of decimals kept in the m/z column labels.
            dtype: dtype applied to both the coordinate and intensity frames.

        Returns:
            pandas.DataFrame: coordinate columns joined with intensity columns.
        """
        mzs, _ = self.parser.getspectrum(0)
        # Use this instance's parser (previously an undefined global
        # ``dataset`` was referenced here, raising NameError).
        coords = np.array(self.parser.coordinates)
        x, y, _ = coords.T

        px = self.IMS_px_in_micro
        coords_df = pd.DataFrame(
            {
                "x": x,
                "y": y,
                "micro_x_topleft": x * px - px,
                "micro_y_topleft": y * px - px,
                "micro_px_width": np.repeat(px, len(coords)),
            },
            dtype=dtype,
        )

        intensities = np.zeros((len(coords_df), len(mzs)))
        for i in range(len(coords)):
            _, coord_intensities = self.parser.getspectrum(i)
            intensities[i, :] = coord_intensities

        intensities = pd.DataFrame(
            intensities,
            columns=np.round(mzs, mz_precision).astype(str),
            dtype=dtype,
        )
        return coords_df.join(intensities)

    def to_array(self):
        """Return a dense ``(x, y, m/z)`` intensity array.

        Pixel positions are shifted so the minimum x and y map to index 0;
        positions without a spectrum stay zero.

        Raises:
            ValueError: If the spectra do not all have the same length.
        """
        x_min, y_min, x_max, y_max = self.__get_min_max_coords()
        mz_lengths = self.parser.mzLengths
        if not (mz_lengths.count(mz_lengths[0]) == len(mz_lengths)):
            raise ValueError("The number of m/z is not the same at each coordinate.")

        arr = np.zeros((x_max - x_min + 1, y_max - y_min + 1, mz_lengths[0]))
        for idx, (x, y, _) in enumerate(self.parser.coordinates):
            _, intensities = self.parser.getspectrum(idx)
            arr[x - x_min, y - y_min, :] = intensities
        return arr

    def write_zarr(self, path, dtype="i4"):
        """Write the dense array from ``to_array`` to an uncompressed zarr store."""
        arr = self.to_array()
        z_arr = zarr.open(path, mode="w", shape=arr.shape, compressor=None, dtype=dtype)
        z_arr[:, :, :] = arr
def get_ds_spots(ds_id):
    """Collect one merged, normalized spectrum per named spot of a dataset.

    Reads ``raw_datasets/{ds_id}.imzML``, the grid mask
    ``spotting/grids/{ds_id}.npy`` and the mask names
    ``spotting/grids/{ds_id}_mask_names.json``.

    Args:
        ds_id: Dataset identifier used to build the three file paths.

    Returns:
        dict: mask name -> ``(mzs, ints, n_spectra)`` for every mask entry
        whose name is not ``'background'``.
    """
    parser = ImzMLParser(f'raw_datasets/{ds_id}.imzML')
    grid_mask = np.load(f'spotting/grids/{ds_id}.npy')
    # Close the JSON file deterministically instead of leaking the handle.
    with open(f'spotting/grids/{ds_id}_mask_names.json') as mask_names_file:
        mask_names = json.load(mask_names_file)

    # Make a mapping of coordinate -> spectrum index (-1 where no spectrum).
    coords = np.array(parser.coordinates)[:, :2]
    base_coord = np.min(coords, axis=0)
    coord_to_idx = np.ones(np.max(coords, axis=0) - base_coord + 1, dtype='i') * -1
    for i, (x, y) in enumerate(coords):
        coord_to_idx[x - base_coord[0], y - base_coord[1]] = i

    # Collect spectra for each mask item
    spots = {}
    for i, mask_name in enumerate(mask_names):
        if mask_name == 'background':
            continue
        # The mask is indexed (row=y, column=x).
        spectra_ys, spectra_xs = np.nonzero(grid_mask == i)
        # NOTE(review): mask pixels without a spectrum map to -1 here, which
        # is passed to getspectrum unchanged -- confirm this cannot happen.
        spectra = [
            parser.getspectrum(idx)
            for idx in coord_to_idx[spectra_xs, spectra_ys]
        ]
        # Scale every spectrum so its intensities sum to 1e6 before merging.
        norm_spectra = [(mzs, ints * 1e6 / np.sum(ints)) for mzs, ints in spectra]
        mzs, ints = merge_spectra(norm_spectra)
        spots[mask_name] = mzs, ints, len(norm_spectra)
    return spots
def load_imzml_data_set(file):
    """Load every spectrum of an imzML file into two DataFrames.

    :param file: file name, resolved relative to ``data_path_imzml``
    :return: dict with keys ``"mass"`` and ``"intensity"`` (DataFrames with
        one column per spectrum index), ``"x"`` and ``"y"`` (the x and y of
        the last entry in the coordinate list) and ``"f_name"`` (``file`` up
        to its first dot)
    """
    parser = ImzMLParser(os.path.join(data_path_imzml, file))

    last_coordinate = parser.coordinates[-1]
    x_cord = last_coordinate[0]
    y_cord = last_coordinate[1]

    # mzs are mass-over-charge values; intensities are the matching abundances.
    mass_data = {}
    intensity_data = {}
    for idx, (_x, _y, _z) in enumerate(parser.coordinates):
        mass_data[idx], intensity_data[idx] = parser.getspectrum(idx)

    return {
        "mass": pd.DataFrame(mass_data),
        "intensity": pd.DataFrame(intensity_data),
        "x": x_cord,
        "y": y_cord,
        "f_name": file.split('.')[0],
    }
def write_corrected_msi(msi, output_file, tolerance, database_exactmass, step, dalim):
    """Write an m/z-corrected copy of an imzML file.

    Each pixel is processed independently. A pixel is written to
    ``output_file`` only when all of the following hold: more than 30 peaks
    are selected, more than 10 database hits are generated, more than 10
    hits survive selection, and an error model can be fitted. Pixels that
    fail any of these checks are left out of the output.
    """
    with ImzMLWriter(output_file) as writer:
        parser = ImzMLParser(msi, parse_lib='ElementTree')
        for index, (x, y, z) in enumerate(parser.coordinates):
            ms_mzs, ms_intensities = parser.getspectrum(index)

            peaks_mz = ms_mzs[peak_selection(ms_intensities)]
            if not len(peaks_mz) > 30:
                continue

            hit_exp, hit_errors = hits_generation(peaks_mz, database_exactmass, tolerance)
            if not len(hit_errors) > 10:
                continue

            roi = hits_selection(hit_errors, step, tolerance, da_limit=dalim)
            if not np.sum(roi) > 10:
                continue

            mz_error_model = create_lm(hit_exp, hit_errors, tolerance=tolerance,
                                       da_limit=dalim, step=step)
            if not mz_error_model:
                continue

            corrected_mzs = correct_mz_lm(ms_mzs, mz_error_model)
            writer.addSpectrum(corrected_mzs, ms_intensities, (x, y, z))
def run(self):
    """Compute per-spectrum intensity statistics and dump them as JSON.

    For every spectrum of ``self.imzml_filename`` the number of peaks and
    the minimum, maximum, peak-to-peak range and the 5/25/50/75/95th
    percentiles of the intensities are recorded. The result is written to
    ``self.output().path``.
    """
    from pyimzml.ImzMLParser import ImzMLParser
    import json

    pcts = [5, 25, 50, 75, 95]
    stats = {
        'n_peaks': [],
        's_min': [],
        's_max': [],
        's_ptp': [],
        's_pcts': [],
    }
    p = ImzMLParser(self.imzml_filename)
    for i in range(len(p.coordinates)):
        mzs, ints = p.getspectrum(i)
        stats['n_peaks'].append(len(mzs))
        # .item()/.tolist() convert numpy scalars to plain Python numbers;
        # json cannot serialize e.g. numpy.float32 or numpy.int64.
        stats['s_min'].append(np.min(ints).item())
        stats['s_max'].append(np.max(ints).item())
        stats['s_ptp'].append(np.ptp(ints).item())
        stats['s_pcts'].append(np.percentile(ints, pcts).tolist())
    with open(self.output().path, 'w+') as f:
        json.dump(stats, f)
    # Function-call form works on both Python 2 and Python 3.
    print('wrote spec stats')
class FSImzMLReader(ImzMLReader):
    """ImzMLReader backed by an .imzml file located inside a directory."""

    def __init__(self, path: Path):
        """Locate the .imzml file under ``path`` and open a parser on it.

        Any parser failure is re-raised as ``ImzMLError`` carrying the
        formatted traceback, chained to the original exception.
        """
        self.filename = find_file_by_ext(path, 'imzml')
        parser_options = dict(
            parse_lib='ElementTree',
            include_spectra_metadata=METADATA_FIELDS,
        )
        try:
            self._imzml_parser = ImzMLParser(self.filename, **parser_options)
        except Exception as e:
            raise ImzMLError(format_exc()) from e
        super().__init__(self._imzml_parser)

    def iter_spectra(self, sp_idxs: Sequence[int]):
        """Yield ``(sp_idx, mzs, ints)`` for each requested spectrum index.

        Array lengths are checked against the lengths declared by the parser
        and against each other before the spectrum is passed through
        ``_process_spectrum``.
        """
        for requested_idx in sp_idxs:
            mzs, ints = self._imzml_parser.getspectrum(requested_idx)

            expected_mzs = self._imzml_parser.mzLengths[requested_idx]
            assert len(mzs) == expected_mzs, 'Incomplete .ibd file'
            expected_ints = self._imzml_parser.intensityLengths[requested_idx]
            assert len(ints) == expected_ints, 'Incomplete .ibd file'
            assert len(mzs) == len(ints), \
                f"Spectrum {requested_idx} mz and intensity counts don't match"

            out_idx, out_mzs, out_ints = self._process_spectrum(requested_idx, mzs, ints)
            yield out_idx, out_mzs, out_ints
def __read_all(self, filename):
    """
    Internal helper function used to read all data. The function directly
    modifies the self.data entry, which becomes a single array of shape
    ``self.shape`` and dtype ``self.data_type``.

    :param filename: path of the imzML file to read
    """
    self.data = np.zeros(shape=self.shape, dtype=self.data_type)
    log_helper.info(__name__, 'Datacube shape is %s' % [self.data.shape])
    reader = ImzMLParser(filename)
    log_helper.debug(__name__, 'READING ALL DATA!! GIVE ME RAM (please)!')

    # Compute the bin edges for reinterpolation if needed
    if self.imzml_type == self.available_imzml_types['processed']:
        shift = np.diff(self.mz).mean()
        bin_edges = np.append(self.mz, self.mz[-1] + shift)
    else:
        bin_edges = None

    # enumerate() replaces xrange(), which does not exist on Python 3.
    for ind, coord in enumerate(reader.coordinates):
        # Only x and y are used. Slicing accepts both (x, y) and (x, y, z)
        # coordinate tuples; plain 2-value unpacking fails on the latter.
        xidx, yidx = coord[:2]
        # Coordinates may start at arbitrary locations, hence, we need to
        # subtract the minimum to recenter at (0,0)
        xidx -= self.x_pos_min
        yidx -= self.y_pos_min
        # Read the spectrum
        mz, intens = reader.getspectrum(ind)
        # Re-bin intensities onto the common m/z axis if we are in processed mode
        if bin_edges is not None:
            intens = np.histogram(mz, bins=bin_edges, weights=intens)[0]
        # Save the intensity values in our data cube
        self.data[xidx, yidx, :] = intens
def __read_all(self, filename):
    """
    Internal helper function used to read all data. The function directly
    modifies the self.data entry, which becomes a single array of shape
    ``self.shape`` and dtype ``self.data_type``.

    :param filename: path of the imzML file to read
    """
    self.data = np.zeros(shape=self.shape, dtype=self.data_type)
    log_helper.info(__name__, 'Datacube shape is %s' % [self.data.shape])
    reader = ImzMLParser(filename)
    log_helper.debug(__name__,'READING ALL DATA!! GIVE ME RAM (please)!')

    # Spectra of 'processed' files are resampled onto self.mz. (The bin
    # edges computed here previously were never used by the interpolation.)
    needs_interpolation = self.imzml_type == self.available_imzml_types['processed']

    # enumerate() replaces xrange(), which does not exist on Python 3.
    for ind, coord in enumerate(reader.coordinates):
        # Only x and y are used. Slicing accepts both (x, y) and (x, y, z)
        # coordinate tuples; plain 2-value unpacking fails on the latter.
        xidx, yidx = coord[:2]
        # Coordinates may start at arbitrary locations, hence, we need to
        # subtract the minimum to recenter at (0,0)
        xidx -= self.x_pos_min
        yidx -= self.y_pos_min
        # Read the spectrum
        mz, intens = reader.getspectrum(ind)
        # Reinterpolate intensities if we are in processed mode; values
        # outside the spectrum's own m/z range become 0.
        if needs_interpolation:
            f = interpolate.interp1d(mz, intens, fill_value=0, bounds_error=False)
            intens = f(self.mz)
        # Save the intensity values in our data cube
        self.data[xidx, yidx, :] = intens
def spectrum_iter(self):
    """
    Generator function that yields a position and associated spectrum for a selected datacube type.

    :yield: (xidx, yidx) a tuple of ints representing x and y position in the image
    :yield: yi, a numpy 1D-array of floats containing spectral intensities at the given position
            and for the selected datacube type
    """
    reader = ImzMLParser(self.basename)
    # enumerate() replaces xrange(), which does not exist on Python 3.
    for idx, coord in enumerate(reader.coordinates):
        # Only x and y are used; z (if present) is ignored.
        xidx, yidx = coord[0], coord[1]
        # Coordinates may start at arbitrary locations, hence, we need to
        # subtract the minimum to recenter at (0,0)
        xidx -= self.x_pos_min
        yidx -= self.y_pos_min
        mz, intens = reader.getspectrum(idx)
        # Resample onto the common m/z axis if we are in processed mode;
        # values outside the spectrum's own m/z range become 0.
        if self.imzml_type == self.available_imzml_types['processed']:
            f = interpolate.interp1d(mz, intens, fill_value=0, bounds_error=False)
            intens = f(self.mz)
        yield (xidx, yidx), np.asarray(intens)
def get_spec(x, y1, y2, imzML_file):
    """Collect the spectra of the pixels ``(x, y, 1)`` for ``y1 <= y < y2``.

    Args:
        x: x coordinate of the pixel column.
        y1: First y coordinate (inclusive).
        y2: Last y coordinate (exclusive).
        imzML_file: Path of the imzML file to read.

    Returns:
        dict: spectrum index -> numpy array of the values of the map that
        ``tupel2map`` builds from that spectrum. Pixels that are absent from
        the file are reported on stdout and skipped.
    """
    parser = ImzMLParser(imzML_file)

    # One pass over the coordinates instead of a linear list.index() per
    # pixel; setdefault keeps the first index, exactly like list.index().
    coord_to_idx = {}
    for idx, coord in enumerate(parser.coordinates):
        coord_to_idx.setdefault(tuple(coord), idx)

    part_map = dict()
    for y in range(y1, y2):
        idx = coord_to_idx.get((x, y, 1))
        if idx is None:
            # Only a genuinely missing pixel is reported; other errors are
            # no longer swallowed by a bare ``except``.
            print(f"({x}, {y}, 1) is not in list.")
            continue
        spec_map = tupel2map(parser.getspectrum(idx))
        part_map[idx] = np.array(list(spec_map.values()))
    return part_map
def main(argv):
    """Round-trip an imzML file through ImzMLWriter and check the first spectrum.

    Command line: ``-i/--ifile <inputfile>`` (required) and
    ``-o/--ofile <outputfile>`` (defaults to ``<inputfile>.imzML``); ``-h``
    prints the usage. The input is rewritten with float32 m/z and intensity
    arrays, read back, and ``True``/``False`` is printed depending on
    whether the first spectrum survived the round trip.

    Raises:
        IOError: If no input file is given.
    """
    usage = 'test.py -i <inputfile> -o <outputfile>'
    inputfile = ''
    outputfile = ''
    try:
        opts, args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg
    if inputfile == '':
        print(usage)
        raise IOError('input file not specified')
    if outputfile == '':
        outputfile = inputfile + '.imzML'

    # Imported only once the arguments are valid, so usage errors and -h do
    # not depend on pyimzml being importable.
    from pyimzml.ImzMLParser import ImzMLParser

    imzml = ImzMLParser(inputfile)
    spectra = []
    with ImzMLWriter(outputfile, mz_dtype=np.float32, intensity_dtype=np.float32) as writer:
        for i, coords in enumerate(imzml.coordinates):
            mzs, intensities = imzml.getspectrum(i)
            writer.addSpectrum(mzs, intensities, coords)
            spectra.append((mzs, intensities, coords))

    imzml = ImzMLParser(outputfile)
    spectra2 = []
    for i, coords in enumerate(imzml.coordinates):
        mzs, intensities = imzml.getspectrum(i)
        spectra2.append((mzs, intensities, coords))

    if not spectra or not spectra2:
        # Nothing to compare; the round trip matches only if both are empty.
        print(not spectra and not spectra2)
        return

    # Comparing the tuples with == raises ValueError because the truth value
    # of a multi-element numpy array is ambiguous; compare field by field.
    # The reference is cast to float32 because that is the dtype written.
    mzs_in, ints_in, coords_in = spectra[0]
    mzs_out, ints_out, coords_out = spectra2[0]
    print(
        tuple(coords_in) == tuple(coords_out)
        and np.array_equal(np.asarray(mzs_in, dtype=np.float32), mzs_out)
        and np.array_equal(np.asarray(ints_in, dtype=np.float32), ints_out)
    )
def import_imzml_dataset(filepath):
    """Read an .imzml file and return all of its spectra.

    Args:
        filepath: Path of the .imzml file.

    Returns:
        list: One ``spectrum(mzs, intensities, x, y, z)`` object per pixel,
        in the order of the parser's coordinate list.
    """
    parser = ImzMLParser(filepath)

    def load(index, x, y, z):
        # Pair the arrays of one pixel with its position.
        mzs, intensities = parser.getspectrum(index)
        return spectrum(mzs, intensities, x, y, z)

    return [load(index, x, y, z) for index, (x, y, z) in enumerate(parser.coordinates)]
def main(argv):
    """Round-trip an imzML file through ImzMLWriter and check the first spectrum.

    Command line: ``-i/--ifile <inputfile>`` (required) and
    ``-o/--ofile <outputfile>`` (defaults to ``<inputfile>.imzML``); ``-h``
    prints the usage. The input is rewritten with float32 m/z and intensity
    arrays, read back, and ``True``/``False`` is printed depending on
    whether the first spectrum survived the round trip.

    Raises:
        IOError: If no input file is given.
    """
    usage = 'test.py -i <inputfile> -o <outputfile>'
    inputfile = ''
    outputfile = ''
    try:
        opts, args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg
    if inputfile == '':
        print(usage)
        raise IOError('input file not specified')
    if outputfile == '':
        outputfile = inputfile + '.imzML'

    # Imported only once the arguments are valid, so usage errors and -h do
    # not depend on pyimzml being importable.
    from pyimzml.ImzMLParser import ImzMLParser

    imzml = ImzMLParser(inputfile)
    spectra = []
    with ImzMLWriter(outputfile, mz_dtype=np.float32, intensity_dtype=np.float32) as writer:
        for i, coords in enumerate(imzml.coordinates):
            mzs, intensities = imzml.getspectrum(i)
            writer.addSpectrum(mzs, intensities, coords)
            spectra.append((mzs, intensities, coords))

    imzml = ImzMLParser(outputfile)
    spectra2 = []
    for i, coords in enumerate(imzml.coordinates):
        mzs, intensities = imzml.getspectrum(i)
        spectra2.append((mzs, intensities, coords))

    if not spectra or not spectra2:
        # Nothing to compare; the round trip matches only if both are empty.
        print(not spectra and not spectra2)
        return

    # Comparing the tuples with == raises ValueError because the truth value
    # of a multi-element numpy array is ambiguous; compare field by field.
    # The reference is cast to float32 because that is the dtype written.
    mzs_in, ints_in, coords_in = spectra[0]
    mzs_out, ints_out, coords_out = spectra2[0]
    print(
        tuple(coords_in) == tuple(coords_out)
        and np.array_equal(np.asarray(mzs_in, dtype=np.float32), mzs_out)
        and np.array_equal(np.asarray(ints_in, dtype=np.float32), ints_out)
    )
class ImzmlDataset(BaseDataset):
    """BaseDataset implementation that reads spectra from an imzML file."""

    def __init__(self, filename):
        """Open ``filename`` with pyimzml and cache its pixel coordinates."""
        from pyimzml.ImzMLParser import ImzMLParser
        super(ImzmlDataset, self).__init__(filename)
        parser = ImzMLParser(filename)
        self.imzml = parser
        self.coordinates = np.asarray(parser.coordinates)
        # fixme get pixel size from header data
        self.step_size = [1, 1, 1]

    def get_spectrum(self, ix):
        """Return ``[mzs, counts]`` of spectrum ``ix`` as numpy arrays."""
        # todo return MassSpectrum
        mzs, counts = self.imzml.getspectrum(ix)
        mzs_array = np.asarray(mzs)
        counts_array = np.asarray(counts)
        return [mzs_array, counts_array]

    def get_image(self, mz, tol):
        """Return the ion image the parser builds for ``mz`` with tolerance ``tol``."""
        return self.imzml.getionimage(mz, tol)
def save_data_to_csv(filename):
    """Dump all spectra of an imzML file into two CSV files.

    ``filename`` is resolved relative to ``data_path``. The m/z arrays go to
    ``mass_data.csv`` and the intensity arrays to ``intensities.csv`` in the
    current working directory, one column per spectrum index.
    """
    parser = ImzMLParser(os.path.join(data_path, filename))

    # mzs are mass-over-charge values; intensities are the matching abundances.
    mass_by_index = {}
    intensity_by_index = {}
    for index, (_x, _y, _z) in enumerate(parser.coordinates):
        mass_by_index[index], intensity_by_index[index] = parser.getspectrum(index)

    mass_frame = pd.DataFrame(mass_by_index)
    intensity_frame = pd.DataFrame(intensity_by_index)
    mass_frame.to_csv('mass_data.csv')
    intensity_frame.to_csv('intensities.csv')
def import_spectra(filepath, spectra_format="imzml"):
    """Import the spectra stored in a file.

    Args:
        filepath: Path of the file to read.
        spectra_format: ``"imzml"``/``"imzML"`` (supported) or
            ``"brukerflex"``/``"xmass"``/``"Xmass"`` (not implemented yet).

    Returns:
        list: One ``(mzs, intensities)`` pair per spectrum, as returned by
        ``ImzMLParser.getspectrum``.

    Raises:
        NotImplementedError: For the Bruker formats, which have no importer.
        ValueError: For any other, unknown ``spectra_format``.
    """
    ############### IMZML
    if spectra_format in ("imzml", "imzML"):
        ##### Import the libraries
        install_required_packages("pyimzml")
        from pyimzml.ImzMLParser import ImzMLParser
        ##### Parse the imzML file
        parsed_imzml = ImzMLParser(filepath)
        ##### Generate the list of spectra
        # Iterate by index: coordinates are (x, y, z) tuples, so unpacking
        # them into (x, y) raises ValueError.
        spectra = [
            parsed_imzml.getspectrum(i)
            for i in range(len(parsed_imzml.coordinates))
        ]
    ############### XMASS
    elif spectra_format in ("brukerflex", "xmass", "Xmass"):
        # Previously fell through to ``return spectra`` with the name unbound.
        raise NotImplementedError(
            "import of '%s' spectra is not implemented" % spectra_format
        )
    else:
        raise ValueError("unknown spectra format: %r" % (spectra_format,))
    ############### Return the list of spectra
    return spectra
def save_data_to_csv(filename, type):
    """Dump spectra and pixel coordinates of an imzML file into CSV files.

    ``filename`` is resolved relative to ``data_path``. Three files are
    written under ``csvData/``, each suffixed with ``type``:
    ``mass_data_<type>.csv``, ``intensities_<type>.csv`` (one column per
    spectrum index) and ``coords_<type>.csv`` (one row per spectrum index
    with columns x, y, z).
    """
    parser = ImzMLParser(os.path.join(data_path, filename))

    # mzs are mass-over-charge values; intensities are the matching abundances.
    mass_by_index = {}
    intensity_by_index = {}
    coords_by_index = {}
    for index, (x, y, z) in enumerate(parser.coordinates):
        mass_by_index[index], intensity_by_index[index] = parser.getspectrum(index)
        coords_by_index[index] = {"x": x, "y": y, "z": z}

    frames = (
        ('csvData/mass_data_{type}.csv', pd.DataFrame(mass_by_index)),
        ('csvData/intensities_{type}.csv', pd.DataFrame(intensity_by_index)),
        ('csvData/coords_{type}.csv', pd.DataFrame.from_dict(coords_by_index, orient="index")),
    )
    for template, frame in frames:
        frame.to_csv(template.format(type=type))
def run(self):
    """Build one summary image per entry of ``self.im_types`` and dump them as JSON.

    Each entry of ``self.im_types`` names a numpy function (looked up with
    ``getattr(np, im_type)``) that reduces a pixel's intensities to a single
    value. Image ``ii`` is written to ``self.output()[ii].path`` as a dict
    with the flattened values (``im_vect``) and the shape (``im_shape``).
    """
    from pyimzml.ImzMLParser import ImzMLParser
    import json

    parser = ImzMLParser(self.imzml_filename)
    images = {
        im_type: np.zeros((parser.imzmldict["max count of pixels y"],
                           parser.imzmldict["max count of pixels x"]))
        for im_type in self.im_types
    }

    for index, (x, y, _z) in enumerate(parser.coordinates):
        _mzs, ints = parser.getspectrum(index)
        # Pixel (x, y) is stored at row y - 1, column x - 1.
        row, col = y - 1, x - 1
        for im_type in self.im_types:
            reduce_fn = getattr(np, im_type)
            images[im_type][row, col] = reduce_fn(ints)

    for position, im_type in enumerate(self.im_types):
        image = images[im_type]
        result = {
            'im_vect': list(image.flatten()),
            'im_shape': np.shape(image),
        }
        with open(self.output()[position].path, 'w+') as f:
            json.dump(result, f)
def get_spectra_df_from_parser(p: "ImzMLParser", sp_idxs: Iterable[int]):
    """Build peak-level and spectrum-level DataFrames for selected spectra.

    Only peaks with a strictly positive intensity are kept.

    Args:
        p: Parser providing ``getspectrum(i)`` and ``coordinates[i]``.
        sp_idxs: Indices of the spectra to read.

    Returns:
        tuple: ``(peaks_df, spectra_df)``. ``peaks_df`` has the columns
        ``sp``, ``mz`` (float64) and ``ints`` (float32), one row per kept
        peak. ``spectra_df`` is indexed by ``sp`` with the columns ``x``,
        ``y``, ``z``, ``mz_lo``, ``mz_hi`` and ``tic``; for a spectrum
        without any positive peak ``mz_lo``/``mz_hi`` are NaN and ``tic``
        is 0.
    """
    peaks_dfs = []
    spectra = []

    for i in sp_idxs:
        mzs, ints = p.getspectrum(i)
        x, y, z = p.coordinates[i]
        mask = ints > 0
        mzs = mzs[mask].astype(np.float64)
        ints = ints[mask].astype(np.float32)
        if mzs.size:
            peaks_dfs.append(pd.DataFrame({'sp': i, 'mz': mzs, 'ints': ints}))
            mz_lo, mz_hi = np.min(mzs), np.max(mzs)
        else:
            # np.min/np.max raise ValueError on an empty array.
            mz_lo = mz_hi = np.nan
        spectra.append((i, x, y, z, mz_lo, mz_hi, np.sum(ints)))

    if peaks_dfs:
        peaks_df = pd.concat(peaks_dfs)
    else:
        # pd.concat raises ValueError when there is nothing to concatenate.
        peaks_df = pd.DataFrame(columns=['sp', 'mz', 'ints'])
    spectra_df = pd.DataFrame(
        spectra,
        columns=['sp', 'x', 'y', 'z', 'mz_lo', 'mz_hi', 'tic'],
    ).set_index('sp')
    return peaks_df, spectra_df
def search_pixel(self, x: int, y: int) -> dict:
    """Return the spectrum stored for pixel ``(x, y, 1)``.

    If the pixel is not present in the file, the first spectrum (index 0)
    is returned instead, together with its own coordinates.

    Returns:
        dict with the keys ``'mzs'`` and ``'ints'`` (lists) and ``'x'`` and
        ``'y'`` (coordinates of the spectrum actually returned).
    """
    start = time.time()
    log(start, f"pixel parsing imzml at {self.imzml_path}")
    p = ImzMLParser(self.imzml_path)

    # Single scan of the coordinate list (previously ``in`` followed by
    # ``index`` scanned it twice); fall back to index 0 when not found.
    try:
        n = p.coordinates.index((x, y, 1))
    except ValueError:
        n = 0
    coordinate_x = p.coordinates[n][0]
    coordinate_y = p.coordinates[n][1]

    mzs, ints = p.getspectrum(n)
    log(start, "done")
    return {
        'mzs': mzs.tolist(),
        'ints': ints.tolist(),
        'x': coordinate_x,
        'y': coordinate_y,
    }
def main(input_directory, output_directory, num_bins, input_kw=''):
    """Bin the peaks of every imzML file in a directory and export them as CSV.

    The working directory is changed to ``input_directory``. For each
    ``*.imzML`` file whose name contains ``input_kw``, peaks with a height
    of at least 50000 are detected in every pixel, their heights are summed
    into ``num_bins`` equally spaced bin edges between m/z 450 and 1000, and
    one row per pixel is written to
    ``<output_directory>/<name>_<X>x<Y>_aggregated.csv``.

    Args:
        input_directory: Directory containing the imzML files.
        output_directory: Directory receiving the CSV files.
        num_bins: Number of bin edges passed to ``linspace``.
        input_kw: Substring a file name must contain to be processed.
    """
    os.chdir(input_directory)
    files = [file for file in glob.glob("*.imzML") if input_kw in file]
    min_mzs = 450
    max_mzs = 1000

    for f in files:
        print(f)
        p = ImzMLParser(f)
        shape = (p.imzmldict['max count of pixels x'],
                 p.imzmldict['max count of pixels y'])
        spectrums = [p.getspectrum(i) for i in range(len(p.coordinates))]
        all_mzs, all_intensities = zip(*spectrums)

        peaks, peak_intensities = [], []
        for i, intensities in enumerate(all_intensities):
            print(f'Getting Intensities: {i}/{len(all_intensities)}')
            t = signal.find_peaks(intensities, 50*1000)
            peaks.append(all_mzs[i][idxs_to_bool(t[0], len(intensities))])
            peak_intensities.append(t[1]['peak_heights'])

        bins = linspace(min_mzs, max_mzs, num_bins)

        # Collect one dict per pixel and build the frame once.
        # DataFrame.append was removed in pandas 2.0 and copied the whole
        # frame on every call.
        rows = []
        for pixel, (peak_l, intensity_l) in enumerate(zip(peaks, peak_intensities)):
            print(f'Binning: {pixel}/{len(p.coordinates)}')
            curr_pixel = pd.DataFrame({'mzs': peak_l, 'intensities': intensity_l})
            pixel_binned = {}
            for index in range(1, len(bins)):
                lower_bound = bins[index - 1]
                upper_bound = bins[index]
                # between() is inclusive on both ends.
                curr_bin = curr_pixel[curr_pixel['mzs'].between(lower_bound, upper_bound)]
                pixel_binned[lower_bound] = curr_bin['intensities'].sum()
            rows.append(pixel_binned)

        # Columns are all bin edges; each bin is keyed by its lower edge, so
        # the column of the last edge stays empty, as in the appended frame.
        aggregated_df = pd.DataFrame(rows, columns=list(bins))
        # Every appended one-row frame carried index 0; keep that index so
        # the CSV layout is unchanged.
        aggregated_df.index = [0] * len(aggregated_df)

        data_name = path.splitext(f)[0]
        outfile = path.join(output_directory, data_name)
        aggregated_df.to_csv(f'{outfile}_{shape[0]}x{shape[1]}_aggregated.csv')
def spectrum_iter(self):
    """
    Generator function that yields a position and associated spectrum for a selected datacube type.

    :yield: (xidx, yidx) a tuple of ints representing x and y position in the image
    :yield: yi, a numpy 1D-array of floats containing spectral intensities at the given position
            and for the selected datacube type
    """
    reader = ImzMLParser(self.basename)
    # enumerate() replaces xrange(), which does not exist on Python 3.
    for idx, coord in enumerate(reader.coordinates):
        # Only x and y are used; z (if present) is ignored.
        xidx, yidx = coord[0], coord[1]
        # Coordinates may start at arbitrary locations, hence, we need to
        # subtract the minimum to recenter at (0,0)
        xidx -= self.x_pos_min
        yidx -= self.y_pos_min
        mz, intens = reader.getspectrum(idx)
        # Resample onto the common m/z axis if we are in processed mode;
        # values outside the spectrum's own m/z range become 0.
        if self.imzml_type == self.available_imzml_types['processed']:
            f = interpolate.interp1d(mz,intens,fill_value=0,bounds_error=False)
            intens = f(self.mz)
        yield (xidx, yidx), np.asarray(intens)
class IMZMLExtract:
    """Access the spectra of an imzML file grouped into pixel regions.

    Regions are sets of connected pixels. They are read from the cache file
    ``<fname>.regions`` when it exists, otherwise detected from the pixel
    coordinates and written to that file.
    """

    def __init__(self, fname, specStart=0):
        """Open ``fname`` and load or detect its regions.

        Args:
            fname: Path of the imzML file.
            specStart: Number of leading points dropped from every spectrum.
        """
        self.fname = fname
        self.parser = ImzMLParser(fname)
        self.dregions = None

        self.mzValues = self.parser.getspectrum(0)[0]

        self.specStart = specStart
        if self.specStart != 0:
            self.mzValues = self.mzValues[self.specStart:]
            print("WARNING: SPECTRA STARTING AT POSITION", self.specStart)

        self.find_regions()

    def _coord_index(self):
        """Map each pixel coordinate tuple to the index of its first spectrum."""
        lookup = {}
        for idx, coord in enumerate(self.parser.coordinates):
            lookup.setdefault(tuple(coord), idx)
        return lookup

    def get_region_ids(self):
        """Return the ids of all known regions."""
        return [x for x in self.dregions]

    def get_spectrum(self, specid):
        """Return the intensity array of spectrum ``specid``."""
        spectra1 = self.parser.getspectrum(specid)[1]
        return spectra1

    def compare_spectra(self, specid1, specid2):
        """Return the cosine similarity of the intensities of two spectra."""
        spectra1 = self.parser.getspectrum(specid1)[1]
        spectra2 = self.parser.getspectrum(specid2)[1]

        ssum = 0.0
        len1 = 0.0
        len2 = 0.0

        assert (len(spectra1) == len(spectra2))

        for a, b in zip(spectra1, spectra2):
            ssum += a * b
            len1 += a * a
            len2 += b * b

        len1 = math.sqrt(len1)
        len2 = math.sqrt(len2)

        return ssum / (len1 * len2)

    def get_mz_index(self, value):
        """Return the index of the m/z value closest to ``value``.

        The first closest index wins; 0 is returned when no m/z value is
        within 1000000 of ``value``.
        """
        curIdxDist = 1000000
        curIdx = 0

        for idx, x in enumerate(self.mzValues):
            dist = abs(x - value)

            if dist < curIdxDist:
                curIdx = idx
                curIdxDist = dist

        return curIdx

    def get_region_spectra(self, regionid, back_spectrum=None):
        """Return ``{coord: intensities}`` for every pixel of a region.

        Each spectrum is cut at ``specStart``, optionally reduced by
        ``back_spectrum``, divided by its maximum and shifted so that its
        minimum is 0.

        Returns:
            dict, or None when ``regionid`` is unknown.
        """
        if not regionid in self.dregions:
            return None

        # Built once per call instead of a linear list.index() per pixel,
        # which also never returned the None / negative value that used to
        # be checked for.
        coord2idx = self._coord_index()

        outspectra = {}

        for coord in self.dregions[regionid]:
            spectID = coord2idx.get(tuple(coord))

            if spectID is None:
                print("Invalid coordinate", coord)
                continue

            cspec = self.parser.getspectrum(spectID)[1]
            cspec = cspec[self.specStart:]

            if len(cspec) == 0:
                print("0 spec")
                continue

            # ``is not None`` so that numpy arrays are accepted as well;
            # their truth value is ambiguous.
            if back_spectrum is not None:
                cspec = np.subtract(cspec, back_spectrum)

            cspec = cspec / np.max(cspec)
            cspec = cspec - np.min(cspec)
            outspectra[coord] = cspec

        return outspectra

    def get_region_range(self, regionid):
        """Return ``((minx, maxx), (miny, maxy), (minz, maxz), length)``.

        ``length`` is the largest spectrum length in the region after
        dropping the first ``specStart`` points.
        """
        allpixels = self.dregions[regionid]

        minx = min([x[0] for x in allpixels])
        maxx = max([x[0] for x in allpixels])

        miny = min([x[1] for x in allpixels])
        maxy = max([x[1] for x in allpixels])

        minz = min([x[2] for x in allpixels])
        maxz = max([x[2] for x in allpixels])

        coord2idx = self._coord_index()

        spectraLength = 0
        for coord in allpixels:
            spectID = coord2idx.get(tuple(coord))

            if spectID is None:
                print("Invalid coordinate", coord)
                continue

            splen = self.parser.mzLengths[spectID] - self.specStart
            spectraLength = max(spectraLength, splen)

        return (minx, maxx), (miny, maxy), (minz, maxz), spectraLength

    def get_region_shape(self, regionid):
        """Return the array shape of a region: x, y, (z if more than one), m/z."""
        rr = self.get_region_range(regionid)
        xr, yr, zr, sc = rr

        imzeShape = [xr[1] - xr[0] + 1, yr[1] - yr[0] + 1]

        if zr[1] - zr[0] + 1 > 1:
            imzeShape.append(zr[1] - zr[0] + 1)

        imzeShape.append(sc)

        spectraShape = tuple(imzeShape)

        return spectraShape

    def get_region_array(self, regionid, back_spectrum=None):
        """Return the region as a dense float32 array of normalized spectra.

        Spectra shorter than the longest one in the region are zero-padded
        at the end.
        """
        xr, yr, zr, sc = self.get_region_range(regionid)
        rs = self.get_region_shape(regionid)
        print(rs)

        sarray = np.zeros(rs, dtype=np.float32)

        coord2spec = self.get_region_spectra(regionid, back_spectrum)

        for coord in coord2spec:
            xpos = coord[0] - xr[0]
            ypos = coord[1] - yr[0]

            spectra = coord2spec[coord]

            if len(spectra) < sc:
                # A spectrum is 1-D, so a single (before, after) pair is
                # required; a 2-D pad width raises ValueError.
                spectra = np.pad(spectra, (0, sc - len(spectra)),
                                 mode='constant', constant_values=0)

            sarray[xpos, ypos, :] = spectra

        return sarray

    def find_regions(self):
        """Load regions from ``<fname>.regions`` or detect and cache them.

        Each line of the cache file holds ``x``, ``y``, ``z`` and the region
        id, separated by tabs.
        """
        if os.path.isfile(self.fname + ".regions"):
            print("Opening regions file for", self.fname)

            with open(self.fname + ".regions", 'r') as fin:
                self.dregions = defaultdict(list)

                for line in fin:
                    line = line.strip().split("\t")
                    coords = [int(x) for x in line]
                    self.dregions[coords[3]].append(tuple(coords[0:3]))
        else:
            self.dregions = self.__detectRegions(self.parser.coordinates)

            with open(self.fname + ".regions", 'w') as outfn:
                for regionid in self.dregions:
                    for pixel in self.dregions[regionid]:
                        print("\t".join([str(x) for x in pixel]), regionid, sep="\t", file=outfn)

    def __dist(self, x, y):
        """Return the Manhattan distance between two coordinate tuples."""
        assert (len(x) == len(y))

        dist = 0
        for pidx in range(0, len(x)):
            dist += abs(x[pidx] - y[pidx])

        return dist

    def __detectRegions(self, allpixels):
        """Group pixels into regions of directly adjacent pixels.

        Two pixels are adjacent when their Manhattan distance is at most 1.

        Returns:
            dict: region id -> list of coordinate tuples.
        """
        allregions = []

        for idx, pixel in enumerate(allpixels):
            if len(allregions) == 0:
                allregions.append([pixel])
                continue

            if idx % 1000 == 0:
                print("At pixel", idx, "of", len(allpixels), "with", len(allregions), "regions")

            # Indices of all regions that contain a neighbour of this pixel.
            accRegions = []

            for ridx, region in enumerate(allregions):
                for coord in region:
                    if self.__dist(coord, pixel) <= 1:
                        accRegions.append(ridx)
                        break

            if len(accRegions) == 0:
                allregions.append([pixel])

            elif len(accRegions) == 1:
                for ridx in accRegions:
                    allregions[ridx].append(pixel)

            elif len(accRegions) > 1:
                # The pixel connects several regions: merge them.
                bc = len(allregions)

                totalRegion = []
                for ridx in accRegions:
                    totalRegion += allregions[ridx]
                # The connecting pixel belongs to the merged region as well;
                # it used to be dropped from every region.
                totalRegion.append(pixel)

                for ridx in sorted(accRegions, reverse=True):
                    del allregions[ridx]

                allregions.append(totalRegion)

                ac = len(allregions)

                assert (ac == bc + 1 - len(accRegions))

        outregions = {}

        for i in range(0, len(allregions)):
            outregions[i] = [tuple(x) for x in allregions[i]]

        return outregions

    def avg_background(self, background_id):
        """Return the per-m/z average intensity over a background region.

        Only pixels ``(x, y, 1)`` inside the bounding box of the region are
        considered; missing pixels are reported on stdout and skipped.

        Returns:
            list: average intensities, in order of first appearance of
            their m/z value.
        """
        region_range = self.get_region_range(background_id)
        xs = (region_range[0][0], region_range[0][1])
        ys = (region_range[1][0], region_range[1][1])

        coord2idx = self._coord_index()

        mz2intens = {}
        # NOTE(review): range() excludes the maximum x and y of the
        # bounding box -- confirm whether that is intended.
        for x in range(xs[0], xs[1]):
            for y in range(ys[0], ys[1]):
                idx = coord2idx.get((x, y, 1))
                if idx is None:
                    print(f"({x}, {y}, 1) is not in list.")
                    continue

                tupl = self.parser.getspectrum(idx)
                sp = dict(zip(tupl[0], tupl[1]))
                for key in sp:
                    mz2intens.setdefault(key, []).append(sp[key])

        mz2avg = {}
        for key in mz2intens:
            mz2avg[key] = sum(mz2intens[key]) / len(mz2intens[key])

        return list(mz2avg.values())
def __compute_file_info(cls, filename, resolution):
    """
    Internal helper function used to compute the mz axis, data type for the intensities, format type

    :param filename: path of the imzML file
    :param resolution: resolution used to build the m/z axis of 'processed' files

    :return: Numpy array with the pixel coordinates
    :return: Numpy array with mz axis
    :return: string with data type
    :return: imzml file type
    :return: dataset, instrument and method metadata dicts
    """
    # TODO completely refactor this to make it smartly handle profile or centroid datasets
    # TODO: centroid datasets should take in a user parameter "Resolution" and resample data at that resolution
    # TODO: profile datasets should work as is
    # TODO: checks for profile data vs. centroid data on the variation in length of ['m/z array']
    reader = ImzMLParser(filename)
    # Read the first spectrum
    mz_axes, intens = reader.getspectrum(0)
    # Read the coordinates
    coordinates = np.asarray(reader.coordinates)
    # Determine the data type for the intensity values
    dtype = np.asarray(intens).dtype.str

    # Compute the mz axis and file type: the file is 'processed' as soon as
    # one spectrum has an m/z axis different from the first one.
    file_type = cls.available_imzml_types['continuous']
    min_mz, max_mz = np.amin(mz_axes), np.amax(mz_axes)
    for ind in range(coordinates.shape[0]):
        mz, intens = reader.getspectrum(ind)
        # array_equal works for tuples, lists and arrays of any length;
        # ``mz == mz_axes`` is ambiguous (ValueError) for numpy arrays.
        if np.array_equal(mz, mz_axes):
            continue
        file_type = cls.available_imzml_types['processed']
        min_mz = min(min_mz, np.amin(mz))
        max_mz = max(max_mz, np.amax(mz))

    # Reinterpolate the mz-axis if we have a processed mode imzml file
    if file_type == cls.available_imzml_types['processed']:
        # np.logspace requires an integer number of samples.
        num_points = int(np.ceil(1e6 * np.log(max_mz / min_mz) / resolution))
        mz_axes = np.logspace(np.log10(min_mz), np.log10(max_mz), num_points)
        log_helper.info(
            __name__, "Reinterpolated m/z axis for processed imzML file")

    # Construct the imzml metadata information
    dataset_metadata = metadata_dict()
    instrument_metadata = metadata_dict()
    method_metadata = metadata_dict()
    # .items() exists on both Python 2 and 3 (.iteritems() only on 2).
    for k, v in reader.imzmldict.items():
        dataset_metadata[k] = metadata_value(name=k,
                                             value=v,
                                             unit=None,
                                             description=k,
                                             ontology=None)

    # Delete the parser and read the metadata
    del reader

    # Parse the metadata for the file. We try to parse only the header and ignore the
    # <run > group in the XML file to avoid going through the whole file again
    # while extracting the majority of the relevant metadata
    try:
        with open(filename, 'r') as ins:
            metdata_header = ''
            for line in ins:
                if '<run' in line:
                    break
                else:
                    metdata_header += line
            metdata_header += '</mzML>'
            metdata_header_dict = xmltodict.parse(metdata_header)['mzML']
            for k, v in metdata_header_dict.items():
                store_value = metadata_value(
                    name=k,
                    value=v,
                    unit=None,
                    description=str(k) + " extracted from imzML XML header.",
                    ontology=None)
                if k == 'instrumentConfigurationList':
                    instrument_metadata[k] = store_value
                elif k == 'dataProcessingList':
                    method_metadata[k] = store_value
                elif k == 'scanSettingsList':
                    dataset_metadata[k] = store_value
                elif k == 'softwareList':
                    method_metadata[k] = store_value
                elif k == 'sampleList':
                    method_metadata[k] = store_value
                else:
                    dataset_metadata[k] = store_value
            dataset_metadata['imzml_xml_metadata_header'] = metadata_value(
                name='imzml_xml_metadata_header',
                value=metdata_header,
                unit=None,
                description='XML imzML header',
                ontology=None)
    except Exception:
        # Best effort: the header metadata is optional. ``Exception`` (not a
        # bare except) lets KeyboardInterrupt/SystemExit propagate.
        log_helper.warning(
            __name__, "Extraction of additional imzML metadata failed")

    return coordinates, np.asarray(
        mz_axes
    ), dtype, file_type, dataset_metadata, instrument_metadata, method_metadata
#%% # # Obtain data # !mkdir msi_data # !wget -O msi_data/test_POS.imzML https://www.ebi.ac.uk/metabolights/ws/studies/MTBLS487/download/9c756a3f-2c96-4449-8dd7-d64540df5c6c\?file\=test_POS.imzML # !wget wget -O msi_data/test_POS.ibd https://www.ebi.ac.uk/metabolights/ws/studies/MTBLS487/download/9c756a3f-2c96-4449-8dd7-d64540df5c6c\?file\=test_POS.ibd #%% # Parse data p = ImzMLParser('msi_data/test_POS.imzML') dimensions = (max(coor[0] for coor in p.coordinates), max(coor[1] for coor in p.coordinates)) # We know that z has only one value: 1 picture = [[None for y in range(dimensions[1])] for x in range(dimensions[0])] for idx, (x, y, z) in enumerate(p.coordinates): mzs, intensities = p.getspectrum(idx) s = Spectrum(confs=list(zip(mzs, intensities)), label=str(x - 1) + ", " + str(y - 1)) # remove peptide artifacts s.confs = [x for x in s.confs if x[0] < 1000] picture[x - 1][y - 1] = s #%% # Apply peak-picking procedure for row in picture: for spectrum in row: spectrum.confs = spectrum.find_peaks() spectrum.confs = spectrum.centroid(0.5) #%% from MasSpOT import perform_clusterization label_picture = perform_clusterization(picture, dimensions)
def get_consensus(cluster_id, matrix, dist_dot_product, ids, imzMLfile, xs, ys, plots=False):
    """Build a consensus spectrum for one cluster of an imzML dataset.

    The spectra belonging to ``cluster_id`` are looked up with
    ``get_cluster_elements`` and merged with ``average_spectra``; each
    spectrum is handled as an ``{m/z: intensity}`` dict produced by
    ``tupel2map``.

    Args:
        cluster_id: Label of the cluster to summarise.
        matrix: Cluster label image, passed through to
            ``get_cluster_elements``.
        dist_dot_product: Pairwise distance matrix, indexed by positions in
            ``ids``.
        ids: Sequence mapping matrix positions to spectrum ids
            (``ids.index(spectrum_id)`` gives the matrix row).
        imzMLfile: Path of the imzML file to read spectra from.
        xs, ys: Region ranges, passed through to ``get_cluster_elements``.
        plots: When true, plot every member spectrum and the consensus,
            each normalised by its maximum intensity.

    Returns:
        dict: The consensus spectrum as an ``{m/z: intensity}`` mapping.
    """
    parser = ImzMLParser(imzMLfile)
    cluster_ids = get_cluster_elements(cluster_id, matrix, parser, xs, ys)
    cluster_matrix_ids = [ids.index(elem) for elem in cluster_ids]

    if len(cluster_matrix_ids) == 1:
        # A single-member cluster is its own consensus. The spectrum is read
        # by its spectrum id (cluster_ids), like every other getspectrum call
        # in this function; the matrix row index is only valid for
        # dist_dot_product.
        return tupel2map(parser.getspectrum(cluster_ids[0]))

    n_members = len(cluster_matrix_ids)
    distance = np.zeros((n_members, n_members))
    for i in range(n_members):
        for j in range(n_members):
            distance[i, j] = distance[j, i] = dist_dot_product[
                cluster_matrix_ids[i], cluster_matrix_ids[j]]
    print(distance.shape)
    np.fill_diagonal(distance, 0)

    # NOTE(review): the clustering result below is computed but never used
    # afterwards (``order`` is not read); kept so that invalid distance
    # matrices still fail here as before.
    Z = linkage(squareform(distance), method='average', metric='cosine')
    c = fcluster(Z, t=0, criterion='distance')
    order = [
        x for _, x in sorted(zip(c, range(n_members)), key=lambda pair: pair[0])
    ]

    # Walk through neighbouring members and merge each one into the running
    # average, or start a new group when the next member is at least as close
    # as the previous one.
    new_spectum = {}
    spectra_list = list()
    for i in range(n_members - 1):
        if i == 0:
            new_spectum = average_spectra(
                tupel2map(parser.getspectrum(cluster_ids[i])),
                tupel2map(parser.getspectrum(cluster_ids[i + 1])))
        else:
            left = distance[i - 1, i]
            right = distance[i, i + 1]
            if left > right:
                new_spectum = average_spectra(
                    new_spectum,
                    tupel2map(parser.getspectrum(cluster_ids[i])))
            else:
                spectra_list.append(new_spectum)
                new_spectum = average_spectra(
                    tupel2map(parser.getspectrum(cluster_ids[i])),
                    tupel2map(parser.getspectrum(cluster_ids[i + 1])))

    # NOTE(review): the nesting of this if/else could not be recovered with
    # certainty from the original formatting; this is the literal reading.
    # Confirm whether the final group was meant to be appended and all
    # groups averaged.
    if not spectra_list:
        spectra_list.append(new_spectum)
        consensus = spectra_list[0]
        for spect in spectra_list:
            consensus = average_spectra(consensus, spect)
    else:
        consensus = new_spectum

    if plots:
        plt.figure()
        for i in cluster_ids:
            spectrum = tupel2map(parser.getspectrum(i))
            # unpack a list of pairs into two tuples
            x, y = zip(*spectrum.items())
            # zip() yields tuples, which cannot be divided by a scalar;
            # convert to an array before normalising.
            plt.plot(x, np.asarray(y) / max(y), label="Spectral ID {}".format(i))
        x, y = zip(*consensus.items())
        plt.plot(x, np.asarray(y) / max(y), label="Consensus", c='black')
        plt.xlabel("m/z", fontsize=20)
        plt.ylabel("Intensity (normalized by maximum internsity)", fontsize=20)
        plt.legend(fontsize=20)
        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        plt.show()
    return consensus
def write_msi_to_hdf(self, h5f_fname_w_path, imzml_fname_w_path, norm=''):
    """
    Converts imzML data to HDF5 format.

    Iterates through the imzML file and, one spectrum at a time, reads and
    writes spectral data to the HDF5 file as a new raw dataset. Spectra are
    stored in the order in which they are read from the imzML file.

    Args:
        h5f_fname_w_path: Path of the HDF5 file to create (overwritten).
        imzml_fname_w_path: Path of the imzML file to read.
        norm: Normalisation name handed to
            ``self._calculate_new_normalization_``; '' or 'none' (any case)
            skips normalisation.

    Side effects:
        Sets ``self.mzsaxis`` to the m/z axis of the first spectrum and sets
        NumPy floating-point error handling to 'print'.
    """
    imzml = ImzMLParser(imzml_fname_w_path)

    # Infer dataset dimensions from the first spectrum in the imzML file.
    samp_mz, _ = imzml.getspectrum(0)
    length = len(samp_mz)
    self.mzsaxis = np.asarray(samp_mz)
    height = len(imzml.mzOffsets)
    del samp_mz

    # h5py rejects chunks larger than the dataset, so clamp the chunk width
    # for spectra shorter than 1024 points.
    chunks = (1, min(1024, length)) if length else None

    # The context manager guarantees the file is flushed and closed even if
    # reading a spectrum or computing the normalisation fails.
    with h5py.File(h5f_fname_w_path, 'w') as hf:
        print('\n%i spectra x %i mz data points'%(height, length))
        print()
        print('Importing...')
        print('')

        grp = hf.create_group('MALDI_001')
        dset1 = grp.create_dataset('Intensities', shape=(height, length),
                                   chunks=chunks, dtype='float64')
        grp.create_dataset('m_over_z', data=self.mzsaxis)
        dset3 = grp.create_dataset('Coordinates', shape=(height, 2))
        grp.create_group('Normalization Factors')

        # Iterate through the spectra of the .ibd binary using getspectrum().
        # Each intensity list becomes one row of 'Intensities', each physical
        # (x, y) position one row of 'Coordinates'.
        summed_ints = np.zeros(length)
        for i in range(height):
            _, intensity = imzml.getspectrum(i)
            try:
                coordinates = imzml.get_physical_coordinates(i)
            except KeyError:
                # Fall back to the x pixel size when the file lacks a y size.
                imzml.imzmldict['pixel size y'] = imzml.imzmldict['pixel size x']
                coordinates = imzml.get_physical_coordinates(i)

            dset1[i] = intensity
            dset3[i] = np.asarray([coordinates[0], coordinates[1]])
            summed_ints += intensity

            if self.log and i > 0 and i % 1000 == 0:
                print('%i / %i'%((i, height)))

        average_spectrum = summed_ints / height
        grp.create_dataset('Average spectrum', data=average_spectrum,
                           dtype='float64')
        del summed_ints, average_spectrum

        # Set NumPy error handling to 'print'.
        # NOTE(review): NumPy's own default is 'warn' - confirm 'print' is
        # the setting wanted here.
        np.seterr(all = 'print')

        print('Finished importing!')
        if norm != '' and norm.upper() != 'NONE':
            self._calculate_new_normalization_(norm, hf)
        hf.flush()
    return
class DefectFilter:
    """Mass-defect based filtering of the spectra of an imzML file.

    All spectra are read into memory on construction. ``MSIFilter`` removes
    non-positive intensities, computes mass defects per spectrum, zeroes
    peaks whose defect is unlikely for the complex of interest and writes
    the result to a new imzML file.
    """

    def __init__(self, filename):
        """ Initialize Filtering Framework from an imzml file """
        self.spectrum = ImzMLParser(filename)
        self.mzlist = []
        self.intensity_list = []
        self.filename = []
        self.filter_spec_mass = np.zeros(np.shape(self.mzlist))
        self.filter_spec_intens = np.zeros(np.shape(self.intensity_list))
        # Accumulators filled by kendrickMassList().
        self.KM2 = []
        self.KMD2 = []
        for idx, _ in enumerate(self.spectrum.coordinates):
            # self.mzs / self.intensities end up holding the LAST spectrum;
            # glycanFilter() builds its lookup table from self.mzs.
            self.mzs, self.intensities = self.spectrum.getspectrum(idx)
            self.mzlist.append(self.mzs)
            self.intensity_list.append(self.intensities)

    def MSIFilter(self, coi, alpha):
        "Filter imzML file for complex of interest"
        # NOTE(review): only "N-Glycan" sets up the lookup table that
        # glycanProb() needs; any other value fails there with
        # AttributeError. Confirm whether other complexes are planned.
        if coi == "N-Glycan":
            self.glycanFilter()

        truefiltertime = time.time()
        self.filterIntens(self.intensity_list, self.mzlist)
        truefilterend = time.time()
        print("Removal of 0 values: " + str(truefilterend - truefiltertime))

        self.glycan_intens = []
        for i in range(len(self.filtered_intens)):
            kendricktime = time.time()
            self.kendrickMass(self.filtered_mzs[i])
            kendrickend = time.time()
            print("KMD Algorithm Time: " + str(kendrickend - kendricktime))

            filtertime = time.time()
            probFilter = self.glycanProb(self.KM, self.KMD, alpha,
                                         self.filtered_intens[i])
            filterend = time.time()
            print("Prob Time: " + str(filterend - filtertime))
            self.glycan_intens.append(probFilter)

        outname = "Filtered_mz_" + str(np.random.randint(100000))
        with ImzMLWriter(outname) as w:
            # filterIntens() drops spectra without any signal, so the n-th
            # filtered spectrum is not necessarily the n-th pixel: look the
            # coordinate up through the original spectrum index.
            for mzs, intens, source_idx in zip(self.filtered_mzs,
                                               self.glycan_intens,
                                               self.filtered_spectrum_indices):
                w.addSpectrum(mzs, intens,
                              self.spectrum.coordinates[source_idx])
        print("File Written to : " + outname)

    def glycanFilter(self, max_defect=3):
        """create a line for the glycan filter based on ASMS 2019 poster

        Builds ``self.glycanDict`` mapping every m/z of ``self.mzs`` (the
        last spectrum read in ``__init__``) to its expected mass defect, and
        sets ``self.glycanSigma``. ``max_defect`` is currently unused.
        """
        mzs = np.asarray(self.mzs)
        self.glycanMD = mzs * 3.5 * 10**(-4) + 0.0039
        self.glycanSigma = 0.0173
        self.glycanDict = dict(zip(mzs, self.glycanMD))

    def glycanProb(self, KM, KMD, alpha, intensities, dist='Norm'):
        # Replace gylcanProb with a t-test or z-test from software
        """
        Provide an intensity spectrum filtered for KMD values within alpha
        of known values (for a single spectrum).

        Args:
            KM: m/z keys of the spectrum, as produced by ``kendrickMass``.
            KMD: For each entry of ``KM``, the list of candidate defects.
            alpha: Peaks whose best probability exceeds this are zeroed.
            intensities: Intensities of the spectrum (not modified).
            dist: Distribution name; anything but 'Norm' zeroes every peak
                whenever ``alpha`` is below 1.

        Returns:
            A copy of ``intensities`` with rejected peaks set to 0.
        """
        glycanFilterInt = intensities.copy()
        for i in range(len(KM)):
            # NOTE(review): raises KeyError when KM[i] is not an m/z of the
            # spectrum glycanFilter() was built from - confirm all spectra
            # share one m/z axis.
            xbar = self.glycanDict[KM[i]]
            bestProb = 1
            for j in KMD[i]:
                if dist != 'Norm':
                    break
                prob = st.norm.cdf(
                    abs(xbar - j), loc=0, scale=self.glycanSigma) - 0.5
                if prob < bestProb:
                    bestProb = prob
            if bestProb > alpha:
                glycanFilterInt[i] = 0
        return glycanFilterInt

    def kendrickMass(self, mzs, max_defect=3):
        """ for single spectrum

        Fills ``self.KMdict`` with one entry per m/z. Each m/z contributes
        its fractional part, shifted by ``i``, to the entry of the m/z that
        lies exactly ``i`` (0..max_defect) below it, if such an entry was
        already seen. ``self.KM`` / ``self.KMD`` expose keys and values.
        """
        # Start with KMs between 0 and 1:
        self.KMdict = {}
        for mz in mzs:
            self.KMdict[mz] = []
            defect, _ = np.modf(mz)
            for i in range(max_defect + 1):
                if mz - i in self.KMdict:
                    self.KMdict[mz - i].append(defect + i)
        self.KM = list(self.KMdict.keys())
        self.KMD = list(self.KMdict.values())

    def kendrickMassList(self, mzs):
        """ for single spectrum

        Appends the Kendrick masses of ``mzs`` to ``self.KM2`` and the
        matching Kendrick mass defects to ``self.KMD2``.
        """
        # Accept plain lists (as produced by filterIntens) as well as arrays.
        KM = np.asarray(mzs, dtype=float) * 14 / 14.01565
        self.KM2.append(KM)
        KMD = np.floor(KM) - KM
        self.KMD2.append(KMD)

    def KMDplot(self):
        """Scatter-plot the filtered m/z values against their KMD values."""
        axes = plt.axes()
        axes.set_ylim([-1, 0])
        for masses, defects in zip(self.filtered_mzs, self.KMD2):
            plt.scatter(masses, defects)
        plt.show()

    def filterIntens(self, intens_list, mzlist, thresh=0):
        """Keep only the peaks whose intensity is above ``thresh``.

        Spectra without a single intensity above ``thresh`` are dropped
        entirely. Results are stored on the instance:

        * ``filtered_intens`` / ``filtered_mzs``: kept peaks per spectrum.
        * ``filter_idx``: per kept peak ``(spectrum index, kept position)``.
        * ``filtered_spectrum_indices``: original index of each kept
          spectrum.
        """
        print("iteration")
        self.filtered_intens = []
        self.filtered_mzs = []
        self.filter_idx = []
        self.filtered_spectrum_indices = []
        for i, (raw_intens, raw_mzs) in enumerate(zip(intens_list, mzlist)):
            spec_intens = np.asarray(raw_intens)
            if np.all(spec_intens <= thresh):
                continue
            kept = np.flatnonzero(spec_intens > thresh)
            self.filtered_intens.append([spec_intens[k] for k in kept])
            self.filtered_mzs.append([raw_mzs[k] for k in kept])
            self.filter_idx.append([(i, j) for j in range(len(kept))])
            self.filtered_spectrum_indices.append(i)

    def kendrickFilter(self, thresh, intens_list, mzlist):
        """ Takes full spectrum lists not single spectrum

        Filters all spectra with ``filterIntens`` and recomputes
        ``self.KM2`` / ``self.KMD2`` for the spectra that remain.
        """
        self.filterIntens(intens_list, mzlist, thresh)
        self.KM2 = []
        self.KMD2 = []
        for mzs in self.filtered_mzs:
            self.kendrickMassList(mzs)
imze.get_region_range(region)[0][1] + 1) ys = (imze.get_region_range(region)[1][0], imze.get_region_range(region)[1][1] + 1) def tupel2map(spec): return dict(zip(spec[0], spec[1])) mz2intens = {} print('Calculating pixel map...') for x in range(xs[0], xs[1]): for y in range(ys[0], ys[1]): try: idx = parser.coordinates.index((x, y, 1)) sp = tupel2map(parser.getspectrum(idx)) for k in sp: if k in mz2intens: mz2intens[k].append(sp[k]) else: mz2intens[k] = list() mz2intens[k].append(sp[k]) except: print(f"({x}, {y}, 1) is not in list.") mz2avg = {} for key in mz2intens: mz2avg[key] = sum(mz2intens[key]) / len(mz2intens[key]) if save: filename = imzMLfile + "." + str(region) + "_avg" + ".pickle"
def imzml_to_hdf5(imzml_file_path, out_path, mir_path):
    """Convert an imzML file into a pandas DataFrame stored as HDF5.

    The resulting frame has one row per spectrum and one column per m/z
    value, indexed by (grid_x, grid_y, dataset). It is written to
    ``<out_path>/<dataset name>.h5`` under the key
    ``msi_frame_<dataset name>``.

    Args:
        imzml_file_path: Path of the imzML file to convert.
        out_path: Directory that receives the .h5 file.
        mir_path: If truthy, peaks are selected with
            ``select_peaks_from_msi_frame(msi_frame, mir_path)`` before
            writing.
    """
    base_name = os.path.basename(imzml_file_path)
    dataset_name = os.path.splitext(base_name)[0]

    print()
    print('Loading', imzml_file_path)
    parser = ImzMLParser(imzml_file_path, parse_lib='ElementTree')
    print()
    print('Loading done!')

    # check if all spectra have the same mz axis
    num_spectra = len(parser.mzLengths)
    first_axis_length = len(np.array(parser.getspectrum(0)[0]))
    print()
    print('m/z consistency check ...')
    # getspectrum(i)[0] = mz values, getspectrum(i)[1] = intensities
    all_axes = []
    for spectrum_index in range(num_spectra):
        all_axes.append(parser.getspectrum(spectrum_index)[0])
    mz_index = np.unique(np.concatenate(all_axes))
    if len(mz_index) != first_axis_length:
        print(
            'WARNING: Not all spectra have the same mz values. Missing values are filled with zeros!'
        )
    print()
    print('m/z consistency check done!')

    # DEV: use small range to test bigger datasets on little memory
    mz_selection = slice(None)  # range(100)

    # Load all intensities into a single data frame:
    #   1 row    = 1 spectrum
    #   1 column = all intensities for 1 mz, i.e. one intensity image
    print()
    print('DataFrame creation ...')
    intensity_rows = intensities_generator(parser, mz_index, mz_selection)
    msi_frame = pd.DataFrame(intensity_rows, columns=mz_index[mz_selection])
    print('DataFrame creation done')
    print()
    print("DataFrame size equals: %i pixels, %i mz-values" % msi_frame.shape)
    print()

    if mir_path:
        print()
        print('Peak picking ...')
        msi_frame = select_peaks_from_msi_frame(msi_frame, mir_path)
        print()
        print('Peak picking done!')

    msi_frame = msi_frame.fillna(0)

    # Index the rows by pixel position, then by dataset name.
    xycoordinates = np.asarray(parser.coordinates)[:, [0, 1]]
    grid_index = pd.MultiIndex.from_arrays(xycoordinates.T,
                                           names=("grid_x", "grid_y"))
    msi_frame.set_index(grid_index, inplace=True)
    msi_frame["dataset"] = [dataset_name] * msi_frame.shape[0]
    msi_frame = msi_frame.set_index("dataset", append=True)

    # For some data sets a small fraction of intensities (~0.1%) have been
    # negative, this might be a numerical issue in the imzml export by bruker.
    # DEV ad-hoc fix (couldn't figure out the cause or a more reasonable fix so far)
    msi_frame[msi_frame < 0] = 0

    print()
    print('Write DataFrame ...')
    h5_store_path = os.path.join(out_path, dataset_name + '.h5')
    store_key = 'msi_frame_' + dataset_name
    with pd.HDFStore(h5_store_path, complib='blosc', complevel=9) as store:
        store[store_key] = msi_frame
    print()
    print('done. Script completed!')
class inMemoryIMS():
    """Imaging MS dataset (.imzML or .hdf5) loaded into memory.

    After loading, all peaks of all spectra are held in three parallel,
    m/z-sorted arrays (``mz_list``, ``count_list``, ``idx_list``) so that ion
    images and summary spectra can be formed with binary searches.
    """

    def __init__(self, filename, min_mz=0., max_mz=np.inf, min_int=0.,
                 index_range=None, cache_spectra=True, do_summary=True,
                 norm='none', norm_args=None, spectrum_type='centroids'):
        """Open ``filename`` and load it; see ``load_file`` for the arguments."""
        # Raises OSError early if the file does not exist.
        os.path.getsize(filename)
        self.load_file(filename, min_mz, max_mz, min_int,
                       index_range=index_range, cache_spectra=cache_spectra,
                       do_summary=do_summary, norm=norm, norm_args=norm_args,
                       spectrum_type=spectrum_type)

    def load_file(self, filename, min_mz=0, max_mz=np.inf, min_int=0,
                  index_range=None, cache_spectra=True, do_summary=True,
                  norm='none', norm_args=None, spectrum_type='centroids'):
        """Parse ``filename`` and optionally cache its spectra.

        Args:
            filename: Path to a '.imzml' or '.hdf5' file.
            min_mz, max_mz, min_int: Exclusive limits; only peaks with
                ``min_mz < mz < max_mz`` and ``intensity > min_int`` are kept.
            index_range: Spectrum indices to load from an hdf5 file; None or
                empty loads all. Ignored for imzML files.
            cache_spectra: Keep all peaks in memory (needed for ion images).
            do_summary: Record total (``tic``) and maximum (``mic``) intensity
                per spectrum.
            norm: Normalisation method name, passed lower-cased to
                ``normalise_spectrum``.
            norm_args: Extra arguments for the normalisation.
            spectrum_type: Which data to read from each spectrum.

        Raises:
            TypeError: Unknown file extension, or a spectrum whose m/z and
                intensity arrays differ in length.
        """
        # parse file to get required parameters
        # can use thin hdf5 wrapper for getting data from file
        self.file_dir, self.filename = os.path.split(filename)
        self.filename, self.file_type = os.path.splitext(self.filename)
        self.file_type = self.file_type.lower()
        # Non-string values (e.g. the former default, an empty list) mean
        # "no normalisation".
        self.norm = norm.lower() if hasattr(norm, 'lower') else 'none'
        self.norm_args = {} if norm_args is None else norm_args
        if self.file_type == '.hdf5':
            import h5py
            self.hdf = h5py.File(filename, 'r')  # Readonly, file must exist
            if index_range is None or len(index_range) == 0:
                # A real list: it is consumed by max() and reused afterwards.
                self.index_list = [
                    int(key) for key in self.hdf['/spectral_data'].keys()]
            else:
                self.index_list = index_range
        elif self.file_type == '.imzml':
            from pyimzml.ImzMLParser import ImzMLParser
            self.imzml = ImzMLParser(filename)
            self.index_list = range(0, len(self.imzml.coordinates))
        else:
            raise TypeError('File type not recogised: {}'.format(
                self.file_type))
        self.max_index = max(self.index_list)
        self.coords = self.get_coords()
        step_size = self.get_step_size()
        cube = ion_datacube(step_size=step_size)
        cube.add_coords(self.coords)
        self.cube_pixel_indices = cube.pixel_indices
        self.cube_n_row, self.cube_n_col = cube.nRows, cube.nColumns
        self.histogram_mz_axis = {}
        self.mz_min = 9999999999999.
        self.mz_max = 0.
        # todo this should be read for imzml files, not coded as an input
        self.spectrum_type = spectrum_type
        if cache_spectra or do_summary:
            # load data into memory
            self.mz_list = []
            self.count_list = []
            self.idx_list = []
            if do_summary:
                self.mic = np.zeros((len(self.index_list), 1))
                self.tic = np.zeros((len(self.index_list), 1))
            for ii in self.index_list:
                # load spectrum, keep values gt0 (shouldn't be here anyway)
                this_spectrum = self.get_spectrum(ii)
                mzs, counts = this_spectrum.get_spectrum(source=spectrum_type)
                if len(mzs) != len(counts):
                    raise TypeError(
                        'length of mzs ({}) not equal to counts ({})'.format(
                            len(mzs), len(counts)))
                # Enforce data limits
                valid = np.where((mzs > min_mz) & (mzs < max_mz) &
                                 (counts > min_int))
                counts = counts[valid]
                mzs = mzs[valid]
                # Empty spectra contribute nothing to the range or summaries.
                if len(mzs) > 0:
                    # record min/max
                    lowest, highest = mzs.min(), mzs.max()
                    if lowest < self.mz_min:
                        self.mz_min = lowest
                    if highest > self.mz_max:
                        self.mz_max = highest
                    # record summary values
                    # NOTE(review): rows are addressed by spectrum index, so
                    # this presumably requires indices 0..n-1 - confirm for
                    # hdf5 subsets.
                    if do_summary:
                        self.tic[ii] = counts.sum()
                        self.mic[ii] = counts.max()
                # append ever-growing lists (should probably be preallocated
                # or piped to disk and re-loaded)
                if cache_spectra:
                    self.mz_list.append(mzs)
                    self.count_list.append(counts)
                    self.idx_list.append(np.ones(len(mzs), dtype=int) * ii)
            print('loaded spectra')
            if cache_spectra:
                self.mz_list = np.concatenate(self.mz_list)
                self.count_list = np.concatenate(self.count_list)
                self.idx_list = np.concatenate(self.idx_list)
                # sort by mz for fast image formation
                mz_order = np.argsort(self.mz_list)
                self.mz_list = self.mz_list[mz_order]
                self.count_list = self.count_list[mz_order]
                self.idx_list = self.idx_list[mz_order]
                # split binary searches into two stages for better locality
                self.window_size = 1024
                self.mz_sublist = self.mz_list[::self.window_size].copy()
        print('file loaded')

    def get_step_size(self):
        """Return the pixel step size: [1, 1, 1] for imzML, [] otherwise."""
        if self.file_type == '.imzml':
            return [1, 1, 1]
        else:
            return []

    def get_coords(self):
        """Return the spectrum coordinates from the parser of the open file."""
        # wrapper for redirecting requests to correct parser
        if self.file_type == '.imzml':
            coords = self.get_coords_imzml()
            # swap the first two columns of the imzML coordinates
            coords[:, [0, 1]] = coords[:, [1, 0]]
        elif self.file_type == '.hdf5':
            coords = self.get_coords_hdf5()
        return coords

    def get_coords_imzml(self):
        """Return the imzML pixel coordinates as an (n, 3) array."""
        # get real world coordinates
        print('TODO: convert indices into real world coordinates')
        coords = np.asarray(self.imzml.coordinates)
        if len(self.imzml.coordinates[0]) == 2:
            # 2D - append zero z-coord
            coords = np.concatenate((coords, np.zeros((len(coords), 1))),
                                    axis=1)
        return coords

    def get_coords_hdf5(self):
        """Read the coordinates of every indexed spectrum from the hdf5 file."""
        coords = np.zeros((len(self.index_list), 3))
        for k in self.index_list:
            coords[k, :] = self.hdf['/spectral_data/' + str(k) +
                                    '/coordinates/']
        return coords

    def get_spectrum(self, index):
        """Return spectrum ``index``, normalised with ``self.norm``."""
        # wrapper for redirecting requests to correct parser
        if self.file_type == '.imzml':
            this_spectrum = self.get_spectrum_imzml(index)
        elif self.file_type == '.hdf5':
            this_spectrum = self.get_spectrum_hdf5(index)
        # self.norm is a string after load_file(), so this is only skipped
        # when a caller explicitly sets ``norm`` to [].
        if self.norm != []:
            this_spectrum.normalise_spectrum(method=self.norm,
                                             method_args=self.norm_args)
        return this_spectrum

    def get_spectrum_imzml(self, index):
        """Wrap spectrum ``index`` of the imzML file in a ``mass_spectrum``."""
        mzs, intensities = self.imzml.getspectrum(index)
        ## temp hack -> assume centroided
        this_spectrum = mass_spectrum()
        if self.spectrum_type == 'centroids':
            this_spectrum.add_centroids(mzs, intensities)
        else:
            this_spectrum.add_spectrum(mzs, intensities)
        return this_spectrum

    def get_spectrum_hdf5(self, index):
        """Wrap spectrum ``index`` of the hdf5 file in a ``mass_spectrum``.

        Raises:
            ValueError: Neither profile nor centroid data exist for ``index``.
        """
        this_spectrum = mass_spectrum()
        tmp_str = '/spectral_data/%d' % (index)
        try:
            this_spectrum.add_spectrum(self.hdf[tmp_str + '/mzs/'],
                                       self.hdf[tmp_str + '/intensities/'])
            got_spectrum = True
        except KeyError:
            got_spectrum = False
        try:
            this_spectrum.add_centroids(
                self.hdf[tmp_str + '/centroid_mzs/'],
                self.hdf[tmp_str + '/centroid_intensities/'])
            got_centroids = True
        except KeyError:
            got_centroids = False
        if not any([got_spectrum, got_centroids]):
            raise ValueError(
                'No spectral data found in index {}'.format(index))
        return this_spectrum

    def empty_datacube(self):
        """Return an ``ion_datacube`` carrying the precomputed pixel layout."""
        data_out = ion_datacube()
        # add precomputed pixel indices
        data_out.coords = self.coords
        data_out.pixel_indices = self.cube_pixel_indices
        data_out.nRows = self.cube_n_row
        data_out.nColumns = self.cube_n_col
        return data_out

    def get_ion_image(self, mzs, tols, tol_type='ppm'):
        """Sum, per spectrum, the intensities within ``tols`` of each m/z.

        Args:
            mzs: One m/z value or a sequence of them.
            tols: One tolerance for all ``mzs`` or one per m/z.
            tol_type: 'ppm' scales the tolerance by the m/z; any other value
                uses ``tols`` as absolute m/z windows.

        Returns:
            An ``ion_datacube`` with one image added per m/z.
        """
        mzs = np.atleast_1d(np.asarray(mzs))
        tols = np.atleast_1d(np.asarray(tols))
        data_out = self.empty_datacube()
        if len(tols) == 1:
            tols = tols * np.ones(np.shape(mzs))
        if tol_type == 'ppm':
            tols = tols * mzs / 1e6  # to m/z
        # Fast search for insertion point of mz in self.mz_list
        # First stage is looking for windows using the sublist
        idx_left = np.searchsorted(self.mz_sublist, mzs - tols, 'left')
        idx_right = np.searchsorted(self.mz_sublist, mzs + tols, 'right')
        for mz, tol, il, ir in zip(mzs, tols, idx_left, idx_right):
            lo = max(il - 1, 0) * self.window_size
            hi = ir * self.window_size
            # Second stage is binary search within the windows
            window = self.mz_list[lo:hi]
            il = lo + np.searchsorted(window, mz - tol, 'left')
            ir = lo + np.searchsorted(window, mz + tol, 'right')
            # slice list for code clarity
            idx_vect = self.idx_list[il:ir]
            count_vect = self.count_list[il:ir]
            # bin vectors
            ion_vect = np.bincount(idx_vect, weights=count_vect,
                                   minlength=self.max_index + 1)
            data_out.add_xic(ion_vect, [mz], [tol])
        return data_out

    # Form histogram axis
    def generate_histogram_axis(self, ppm=1.):
        """Store an m/z axis from ``mz_min`` past ``mz_max`` in ``ppm`` steps."""
        ppm_mult = ppm * 1e-6
        mz_current = self.mz_min
        mz_list = [mz_current]
        while mz_current <= self.mz_max:
            mz_current = mz_current + mz_current * ppm_mult
            mz_list.append(mz_current)
        self.histogram_mz_axis[ppm] = mz_list

    def get_histogram_axis(self, ppm=1.):
        """Return the histogram axis for ``ppm``, generating it on first use."""
        if ppm not in self.histogram_mz_axis:
            print('generating histogram axis for ppm {}'.format(ppm))
            self.generate_histogram_axis(ppm=ppm)
        return self.histogram_mz_axis[ppm]

    def generate_summary_spectrum(self, summary_type='mean', ppm=1.,
                                  hist_axis=None):
        """Summarise all cached peaks along a histogram m/z axis.

        Args:
            summary_type: 'mean' (summed intensity per bin divided by the
                number of spectra) or 'freq' (fraction of spectra with a
                peak in the bin).
            ppm: Bin width used when ``hist_axis`` is not given.
            hist_axis: Optional bin edges; None or empty uses the ppm axis.

        Returns:
            (hist_axis, summary): ``summary`` has the length of
            ``hist_axis``; its last entry is always 0.

        Raises:
            ValueError: Unknown ``summary_type``.
        """
        if hist_axis is None or len(hist_axis) == 0:
            hist_axis = self.get_histogram_axis(ppm=ppm)
        # calculate mean along some m/z axis
        mean_spec = np.zeros(np.shape(hist_axis))
        for ii in range(0, len(hist_axis) - 1):
            idx_left = bisect.bisect_left(self.mz_list, hist_axis[ii])
            idx_right = bisect.bisect_right(self.mz_list, hist_axis[ii + 1])
            if summary_type == 'mean':
                mean_spec[ii] = np.sum(self.count_list[idx_left:idx_right])
            elif summary_type == 'freq':
                idx_vect = self.idx_list[idx_left:idx_right]
                mean_spec[ii] = float(len(np.unique(idx_vect)))
            else:
                raise ValueError(
                    'Summary type not recognised; {}'.format(summary_type))
        if summary_type in ('mean', 'freq'):
            mean_spec = mean_spec / len(self.index_list)
        return hist_axis, mean_spec

    def get_summary_image(self, summary_func='tic'):
        """Return the per-spectrum 'tic' or 'mic' values as an ion datacube.

        Raises:
            KeyError: ``summary_func`` is neither 'tic' nor 'mic'.
        """
        if summary_func not in ['tic', 'mic']:
            raise KeyError("requested type not in 'tic' mic'")
        data_out = self.empty_datacube()
        data_out.add_xic(np.asarray(getattr(self, summary_func)), [0], [0])
        return data_out
class inMemoryIMS():
    """Imaging MS dataset (.imzML or .hdf5) loaded into memory.

    After loading, all centroid peaks of all spectra are held in three
    parallel, m/z-sorted arrays (``mz_list``, ``count_list``, ``idx_list``)
    so that ion images and summary spectra can be formed with binary
    searches.
    """

    def __init__(self, filename, min_mz=0., max_mz=np.inf, min_int=0.,
                 index_range=None, cache_spectra=True, do_summary=True,
                 norm=''):
        """Open ``filename`` and load it; see ``load_file`` for the arguments."""
        # Raises OSError early if the file does not exist.
        os.path.getsize(filename)
        self.load_file(filename, min_mz, max_mz, min_int,
                       index_range=index_range, cache_spectra=cache_spectra,
                       do_summary=do_summary, norm=norm)

    def load_file(self, filename, min_mz=0, max_mz=np.inf, min_int=0,
                  index_range=None, cache_spectra=True, do_summary=True,
                  norm=None):
        """Parse ``filename`` and optionally cache its spectra.

        Args:
            filename: Path to a '.imzml' or '.hdf5' file.
            min_mz, max_mz, min_int: Exclusive limits; only peaks with
                ``min_mz < mz < max_mz`` and ``intensity > min_int`` are kept.
            index_range: Spectrum indices to load from an hdf5 file; None or
                empty loads all. Ignored for imzML files.
            cache_spectra: Keep all peaks in memory (needed for ion images).
            do_summary: Record total (``tic``) and maximum (``mic``) intensity
                per spectrum.
            norm: 'TIC', 'RMS' or 'MAD' to normalise each spectrum; None or
                [] disables the normalisation step in ``get_spectrum``.

        Raises:
            TypeError: Unknown file extension, or a spectrum whose m/z and
                intensity arrays differ in length.
        """
        # parse file to get required parameters
        # can use thin hdf5 wrapper for getting data from file
        self.file_dir, self.filename = os.path.split(filename)
        self.filename, self.file_type = os.path.splitext(self.filename)
        self.file_type = self.file_type.lower()
        # get_spectrum() compares against [], the historical "off" value.
        self.norm = [] if norm is None else norm
        if self.file_type == '.hdf5':
            import h5py
            self.hdf = h5py.File(filename, 'r')  # Readonly, file must exist
            if index_range is None or len(index_range) == 0:
                # A real list: it is consumed by max() and reused afterwards.
                self.index_list = [
                    int(key) for key in self.hdf['/spectral_data'].keys()]
            else:
                self.index_list = index_range
        elif self.file_type == '.imzml':
            from pyimzml.ImzMLParser import ImzMLParser
            self.imzml = ImzMLParser(filename)
            self.index_list = range(0, len(self.imzml.coordinates))
        else:
            raise TypeError('File type not recogised: {}'.format(self.file_type))
        self.max_index = max(self.index_list)
        self.coords = self.get_coords()
        step_size = self.get_step_size()
        cube = ion_datacube(step_size=step_size)
        cube.add_coords(self.coords)
        self.cube_pixel_indices = cube.pixel_indices
        self.cube_n_row, self.cube_n_col = cube.nRows, cube.nColumns
        self.histogram_mz_axis = {}
        self.mz_min = 9999999999999.
        self.mz_max = 0.
        if cache_spectra or do_summary:
            # load data into memory
            self.mz_list = []
            self.count_list = []
            self.idx_list = []
            if do_summary:
                self.mic = np.zeros((len(self.index_list), 1))
                self.tic = np.zeros((len(self.index_list), 1))
            for ii in self.index_list:
                # load spectrum, keep values gt0 (shouldn't be here anyway)
                this_spectrum = self.get_spectrum(ii)
                mzs, counts = this_spectrum.get_spectrum(source='centroids')
                if len(mzs) != len(counts):
                    raise TypeError('length of mzs ({}) not equal to counts ({})'.format(len(mzs), len(counts)))
                # Enforce data limits
                valid = np.where((mzs > min_mz) & (mzs < max_mz) & (counts > min_int))
                counts = counts[valid]
                mzs = mzs[valid]
                has_peaks = len(mzs) > 0
                # record min/max; a spectrum left empty by the limits above
                # must not be indexed.
                if has_peaks:
                    if mzs[0] < self.mz_min:
                        self.mz_min = mzs[0]
                    if mzs[-1] > self.mz_max:
                        self.mz_max = mzs[-1]
                # append ever-growing lists (should probably be preallocated
                # or piped to disk and re-loaded)
                if cache_spectra:
                    self.mz_list.append(mzs)
                    self.count_list.append(counts)
                    self.idx_list.append(np.ones(len(mzs), dtype=int) * ii)
                # record summary values; empty spectra keep their zeros
                if do_summary and has_peaks:
                    self.tic[ii] = sum(counts)
                    self.mic[ii] = max(counts)
            print('loaded spectra')
            if cache_spectra:
                self.mz_list = np.concatenate(self.mz_list)
                self.count_list = np.concatenate(self.count_list)
                self.idx_list = np.concatenate(self.idx_list)
                # sort by mz for fast image formation
                mz_order = np.argsort(self.mz_list)
                self.mz_list = self.mz_list[mz_order]
                self.count_list = self.count_list[mz_order]
                self.idx_list = self.idx_list[mz_order]
                # split binary searches into two stages for better locality
                self.window_size = 1024
                self.mz_sublist = self.mz_list[::self.window_size].copy()
        print('file loaded')

    def get_step_size(self):
        """Return the pixel step size: [1, 1, 1] for imzML, [] otherwise."""
        if self.file_type == '.imzml':
            return [1, 1, 1]
        else:
            return []

    def get_coords(self):
        """Return the spectrum coordinates from the parser of the open file."""
        # wrapper for redirecting requests to correct parser
        if self.file_type == '.imzml':
            coords = self.get_coords_imzml()
            # swap the first two columns of the imzML coordinates
            coords[:, [0, 1]] = coords[:, [1, 0]]
        elif self.file_type == '.hdf5':
            coords = self.get_coords_hdf5()
        return coords

    def get_coords_imzml(self):
        """Return the imzML pixel coordinates as an (n, 3) array."""
        # get real world coordinates
        print('TODO: convert indices into real world coordinates')
        coords = np.asarray(self.imzml.coordinates)
        if len(self.imzml.coordinates[0]) == 2:
            # 2D - append zero z-coord
            coords = np.concatenate((coords, np.zeros((len(coords), 1))), axis=1)
        return coords

    def get_coords_hdf5(self):
        """Read the coordinates of every indexed spectrum from the hdf5 file."""
        coords = np.zeros((len(self.index_list), 3))
        for k in self.index_list:
            coords[k, :] = self.hdf['/spectral_data/' + str(k) + '/coordinates/']
        return coords

    def get_spectrum(self, index):
        """Return spectrum ``index`` with its centroids normalised.

        'TIC', 'RMS' and 'MAD' rescale the centroid intensities; any other
        value of ``self.norm`` re-adds them unchanged, and [] skips the step.
        """
        # wrapper for redirecting requests to correct parser
        if self.file_type == '.imzml':
            this_spectrum = self.get_spectrum_imzml(index)
        elif self.file_type == '.hdf5':
            this_spectrum = self.get_spectrum_hdf5(index)
        if self.norm != []:
            mzs, counts = this_spectrum.get_spectrum(source="centroids")
            if self.norm == 'TIC':
                counts = counts / np.sum(counts)
            elif self.norm == 'RMS':
                counts = counts / np.sqrt(np.mean(np.square(counts)))
            elif self.norm == 'MAD':
                counts = counts / np.median(np.absolute(counts - np.mean(counts)))
            this_spectrum.add_centroids(mzs, counts)
        return this_spectrum

    def get_spectrum_imzml(self, index):
        """Wrap spectrum ``index`` of the imzML file in a ``mass_spectrum``."""
        mzs, intensities = self.imzml.getspectrum(index)
        ## temp hack -> assume centroided
        this_spectrum = mass_spectrum()
        this_spectrum.add_centroids(mzs, intensities)
        return this_spectrum

    def get_spectrum_hdf5(self, index):
        """Wrap spectrum ``index`` of the hdf5 file in a ``mass_spectrum``.

        Raises:
            ValueError: Neither profile nor centroid data exist for ``index``.
        """
        this_spectrum = mass_spectrum()
        tmp_str = '/spectral_data/%d' % (index)
        try:
            this_spectrum.add_spectrum(self.hdf[tmp_str + '/mzs/'], self.hdf[tmp_str + '/intensities/'])
            got_spectrum = True
        except KeyError:
            got_spectrum = False
        try:
            this_spectrum.add_centroids(self.hdf[tmp_str + '/centroid_mzs/'], self.hdf[tmp_str + '/centroid_intensities/'])
            got_centroids = True
        except KeyError:
            got_centroids = False
        if not any([got_spectrum, got_centroids]):
            raise ValueError('No spectral data found in index {}'.format(index))
        return this_spectrum

    def empty_datacube(self):
        """Return an ``ion_datacube`` carrying the precomputed pixel layout."""
        data_out = ion_datacube()
        # add precomputed pixel indices
        data_out.coords = self.coords
        data_out.pixel_indices = self.cube_pixel_indices
        data_out.nRows = self.cube_n_row
        data_out.nColumns = self.cube_n_col
        return data_out

    def get_ion_image(self, mzs, tols, tol_type='ppm'):
        """Sum, per spectrum, the intensities within ``tols`` of each m/z.

        Args:
            mzs: One m/z value or a sequence of them.
            tols: One tolerance for all ``mzs`` or one per m/z.
            tol_type: 'ppm' scales the tolerance by the m/z; any other value
                uses ``tols`` as absolute m/z windows.

        Returns:
            An ``ion_datacube`` with one image added per m/z.
        """
        data_out = self.empty_datacube()
        # Accept scalars, lists and arrays alike, and let a single tolerance
        # apply to every m/z.
        mzs = np.atleast_1d(np.asarray(mzs))
        tols = np.atleast_1d(np.asarray(tols))
        if len(tols) == 1:
            tols = tols * np.ones(np.shape(mzs))
        if tol_type == 'ppm':
            tols = tols * mzs / 1e6  # to m/z
        # Fast search for insertion point of mz in self.mz_list
        # First stage is looking for windows using the sublist
        idx_left = np.searchsorted(self.mz_sublist, mzs - tols, 'left')
        idx_right = np.searchsorted(self.mz_sublist, mzs + tols, 'right')
        for mz, tol, il, ir in zip(mzs, tols, idx_left, idx_right):
            lo = max(il - 1, 0) * self.window_size
            hi = ir * self.window_size
            # Second stage is binary search within the windows
            window = self.mz_list[lo:hi]
            il = lo + np.searchsorted(window, mz - tol, 'left')
            ir = lo + np.searchsorted(window, mz + tol, 'right')
            # slice list for code clarity
            idx_vect = self.idx_list[il:ir]
            count_vect = self.count_list[il:ir]
            # bin vectors
            ion_vect = np.bincount(idx_vect, weights=count_vect, minlength=self.max_index + 1)
            data_out.add_xic(ion_vect, [mz], [tol])
        return data_out

    # Form histogram axis
    def generate_histogram_axis(self, ppm=1.):
        """Store an m/z axis from ``mz_min`` past ``mz_max`` in ``ppm`` steps."""
        ppm_mult = ppm * 1e-6
        mz_current = self.mz_min
        mz_list = [mz_current]
        while mz_current <= self.mz_max:
            mz_current = mz_current + mz_current * ppm_mult
            mz_list.append(mz_current)
        self.histogram_mz_axis[ppm] = mz_list

    def get_histogram_axis(self, ppm=1.):
        """Return the histogram axis for ``ppm``, generating it on first use."""
        if ppm not in self.histogram_mz_axis:
            print('generating histogram axis for ppm {}'.format(ppm))
            self.generate_histogram_axis(ppm=ppm)
        return self.histogram_mz_axis[ppm]

    def generate_summary_spectrum(self, summary_type='mean', ppm=1.):
        """Summarise all cached peaks along the ``ppm`` histogram axis.

        Args:
            summary_type: 'mean' (summed intensity per bin divided by the
                number of spectra) or 'freq' (fraction of spectra with a
                peak in the bin).
            ppm: Bin width of the histogram axis.

        Returns:
            (hist_axis, summary): ``summary`` has the length of
            ``hist_axis``; its last entry is always 0.

        Raises:
            ValueError: Unknown ``summary_type``.
        """
        hist_axis = self.get_histogram_axis(ppm=ppm)
        # calculate mean along some m/z axis
        mean_spec = np.zeros(np.shape(hist_axis))
        for ii in range(0, len(hist_axis) - 1):
            idx_left = bisect.bisect_left(self.mz_list, hist_axis[ii])
            idx_right = bisect.bisect_right(self.mz_list, hist_axis[ii + 1])
            if summary_type == 'mean':
                mean_spec[ii] = np.sum(self.count_list[idx_left:idx_right])
            elif summary_type == 'freq':
                idx_vect = self.idx_list[idx_left:idx_right]
                mean_spec[ii] = float(len(np.unique(idx_vect)))
            else:
                raise ValueError('Summary type not recognised; {}'.format(summary_type))
        if summary_type in ('mean', 'freq'):
            mean_spec = mean_spec / len(self.index_list)
        return hist_axis, mean_spec

    def get_summary_image(self, summary_func='tic'):
        """Return the per-spectrum 'tic' or 'mic' values as an ion datacube.

        Raises:
            KeyError: ``summary_func`` is neither 'tic' nor 'mic'.
        """
        if summary_func not in ['tic', 'mic']:
            raise KeyError("requested type not in 'tic' mic'")
        data_out = self.empty_datacube()
        data_out.add_xic(np.asarray(getattr(self, summary_func))[self.index_list], [0], [0])
        return data_out
# Build one consensus spectrum per cluster and measure how far every member
# spectrum is from the consensus of its cluster.
unique_clusters = np.unique(image_UPGMA_pixel1)
print(len(unique_clusters))
cluster2concensus = {}
cluster2comparison = {}
for cluster in unique_clusters:
    print(cluster)
    cluster2concensus[cluster] = consensus.get_consensus(
        cluster, image_UPGMA_pixel1, dist_dot_product, ids, imzMLfile, xs, ys)
    cluster_ids = consensus.get_cluster_elements(cluster, image_UPGMA_pixel1,
                                                 parser, xs, ys)
    # Distance (1 - similarity) of each member spectrum to the consensus.
    tmp = list()
    for i in cluster_ids:
        tmp.append(
            1 - (get_similarity(cluster2concensus[cluster],
                                consensus.tupel2map(parser.getspectrum(i)))))
    cluster2comparison[cluster] = tmp

# Pairwise distances between the consensus spectra. Rows and columns are
# positions in cluster_labels; the dict itself is keyed by cluster label,
# which need not start at 0 or be contiguous.
cluster_labels = list(cluster2concensus.keys())
n_clusters = len(cluster_labels)
consensus_distance = np.zeros((n_clusters, n_clusters))
for cluster1 in range(n_clusters):
    for cluster2 in range(cluster1, n_clusters):
        consensus_distance[cluster1, cluster2] = consensus_distance[
            cluster2, cluster1] = 1 - get_similarity(
                cluster2concensus[cluster_labels[cluster1]],
                cluster2concensus[cluster_labels[cluster2]])

fig = plt.figure()
grid = plt.GridSpec(n_clusters, 3, wspace=0.1, hspace=0.1)
def __compute_file_info(cls, filename, resolution):
    ## TODO completely refactor this to make it smartly handle profile or centroid datasets
    ## TODO: centroid datasets should take in a user parameter "Resolution" and resample data at that resolution
    ## TODO: profile datasets should work as is
    ## TODO: checks for profile data vs. centroid data on the variation in length of ['m/z array']
    """
    Internal helper function used to compute the mz axis, data type for the intensities, format type

    :param filename: Path of the imzML file to inspect.
    :param resolution: Spacing in ppm of the m/z axis that is generated when the
        spectra do not all share the m/z axis of the first spectrum.

    :return: Numpy array with the coordinates of all spectra
    :return: Numpy array with mz axis
    :return: string with data type of the intensities of the first spectrum
    :return: imzml file type (entry of cls.available_imzml_types)
    :return: dataset metadata, instrument metadata, method metadata (metadata_dict each)
    """
    reader = ImzMLParser(filename)
    # Read the first spectrum; its m/z axis is the reference for all others
    mz_axes, intens = reader.getspectrum(0)
    # Read the coordinates
    coordinates = np.asarray(reader.coordinates)

    # #Start the data at [0,0,0]
    # coordinates[:,0] = coordinates[:,0] - np.amin(coordinates,axis=0)[0]
    # coordinates[:,1] = coordinates[:,1] - np.amin(coordinates,axis=0)[1]
    # coordinates[:,2] = coordinates[:,2] - np.amin(coordinates,axis=0)[2]

    # Determine the data type for the intensity values
    dtype = np.asarray(intens).dtype.str

    # Compute the mz axis and file type
    file_type = cls.available_imzml_types['continuous']
    min_mz, max_mz = np.amin(mz_axes), np.amax(mz_axes)
    for ind in range(coordinates.shape[0]):
        mz, intens = reader.getspectrum(ind)
        # array_equal works for tuples, lists and arrays alike and always
        # yields a single bool (``==`` on arrays is element-wise and cannot be
        # used in an ``if``); differing lengths compare as unequal.
        if np.array_equal(mz, mz_axes):
            continue
        file_type = cls.available_imzml_types['processed']
        min_mz = min(min_mz, np.amin(mz))
        max_mz = max(max_mz, np.amax(mz))

    # Reinterpolate the mz-axis if we have a processed mode imzml file
    if file_type == cls.available_imzml_types['processed']:
        # Number of log-spaced samples; np.logspace requires an integer count.
        num_bins = int(np.ceil(1e6 * np.log(max_mz / min_mz) / resolution))
        mz_axes = np.logspace(np.log10(min_mz), np.log10(max_mz), num_bins)
        log_helper.info(__name__, "Reinterpolated m/z axis for processed imzML file")

    # Construct the imzml metadata information
    dataset_metadata = metadata_dict()
    instrument_metadata = metadata_dict()
    method_metadata = metadata_dict()
    for k, v in reader.imzmldict.items():
        dataset_metadata[k] = metadata_value(name=k,
                                             value=v,
                                             unit=None,
                                             description=k,
                                             ontology=None)

    # Delete the parser and read the metadata
    del reader

    # Parse the metadata for the file. We try to parse only the header and ignore the
    # <run > group in the XML file to avoid going through the whole file again
    # while extracting the majority of the relevant metadata
    # Header keys that do not belong to the dataset metadata
    header_targets = {
        'instrumentConfigurationList': instrument_metadata,
        'dataProcessingList': method_metadata,
        'softwareList': method_metadata,
        'sampleList': method_metadata,
    }
    try:
        with open(filename, 'r') as ins:
            header_lines = []
            for line in ins:
                if '<run' in line:
                    break
                header_lines.append(line)
        metdata_header = ''.join(header_lines) + '</mzML>'
        metdata_header_dict = xmltodict.parse(metdata_header)['mzML']
        for k, v in metdata_header_dict.items():
            store_value = metadata_value(name=k,
                                         value=v,
                                         unit=None,
                                         description=str(k) + " extracted from imzML XML header.",
                                         ontology=None)
            # Everything not listed above (including 'scanSettingsList')
            # is stored with the dataset metadata
            header_targets.get(k, dataset_metadata)[k] = store_value
        dataset_metadata['imzml_xml_metadata_header'] = metadata_value(name='imzml_xml_metadata_header',
                                                                       value=metdata_header,
                                                                       unit=None,
                                                                       description='XML imzML header',
                                                                       ontology=None)
    except Exception as error:
        # The extra metadata is best effort only: report the cause and carry on.
        log_helper.warning(__name__, "Extraction of additional imzML metadata failed: " + str(error))

    return coordinates, np.asarray(mz_axes), dtype, file_type, dataset_metadata, instrument_metadata, method_metadata
class ImzMLHandler:
    """Convenience wrapper around an ``ImzMLParser`` for image-style access.

    On construction the selected region of the file is indexed:
    ``self.coordinates`` holds ``(spectrumIndex, x, y)`` tuples and
    ``self.indexImage`` maps every pixel of the region to its spectrum index
    (-1 where no spectrum exists).
    """

    # cropToData: (if True) Remove all rows and columns which contain no data. For some datasets (e.g. Bruker)
    #             the coordinates stored are relative to some external coordinate system, and therefore large
    #             amounts of empty space can be present
    def __init__(self, filename, startX=1, startY=1, width=None, height=None, cropToData=False):
        """Open ``filename`` and index the spectra inside the requested region.

        :param filename: path of the .imzML file.
        :param startX: first column (imzML x coordinate) of the region.
        :param startY: first row (imzML y coordinate) of the region.
        :param width: number of columns; defaults to everything up to the
            largest x coordinate.
        :param height: number of rows; defaults to everything up to the
            largest y coordinate.
        :param cropToData: if True the region starts at the smallest x / y
            present in the file and stored coordinates are shifted so that
            the first data column / row becomes 1.
        """
        self.imzML = ImzMLParser(filename)

        # Find the min and max row and column where data is present
        maxWidth = 0
        maxHeight = 0
        minWidth = -1
        minHeight = -1
        for (x, y, z) in self.imzML.coordinates:
            maxWidth = max(maxWidth, x)
            maxHeight = max(maxHeight, y)
            if minWidth == -1 or minWidth > x:
                minWidth = x
            if minHeight == -1 or minHeight > y:
                minHeight = y

        if cropToData:
            startX = minWidth
            startY = minHeight
        if width is None:
            width = maxWidth - startX + 1
        if height is None:
            height = maxHeight - startY + 1

        self.startX = startX
        self.startY = startY
        self.width = width
        self.height = height
        self.coordinates = []
        self.cropToData = cropToData
        # The builtin int replaces the np.int alias, which NumPy has removed.
        self.indexImage = np.full((height, width), -1, dtype=int)

        # ``index`` is the position of the spectrum in the parser, so it
        # advances for every coordinate, not only for the retained ones.
        for index, (x, y, z) in enumerate(self.imzML.coordinates):
            if not (startX <= x < startX + width and startY <= y < startY + height):
                continue
            if cropToData:
                self.coordinates.append((index, x - minWidth + 1, y - minHeight + 1))
                self.indexImage[y - minHeight, x - minWidth] = index
            else:
                self.coordinates.append((index, x, y))
                self.indexImage[y - startY, x - startX] = index

    def _pixelPosition(self, x, y):
        """Return the ``(row, column)`` array position of stored coordinate (x, y).

        Cropped coordinates are already 1-based relative to the region;
        uncropped ones are raw imzML coordinates offset by startX / startY.
        """
        if self.cropToData:
            return y - 1, x - 1
        return y - self.startY, x - self.startX

    @staticmethod
    def _normaliseToTIC(counts):
        """Divide ``counts`` by their sum; an all-zero spectrum is returned unchanged."""
        total = np.sum(counts)
        if total == 0:
            return counts
        return counts / total

    def getSpectrumWithIndex(self, index):
        """Return ``(mzs, counts)`` of the spectrum at parser position ``index``."""
        return self.imzML.getspectrum(index)

    def getSpectrumAt(self, x, y):
        """Return ``(mzs, counts)`` of the spectrum at coordinate (x, y).

        ``x`` and ``y`` use the same convention as the tuples in
        ``self.coordinates``.

        :raises ValueError: if x or y is not positive, or if no spectrum is
            stored at that location.
        :raises IndexError: if the location lies outside the region.
        """
        if x <= 0 or y <= 0:
            raise ValueError(
                'Both x and y must be positive (> 0) integers, as per .imzML specification.'
            )
        row, col = self._pixelPosition(x, y)
        if row < 0 or col < 0:
            # Negative positions would silently wrap around in NumPy indexing.
            raise IndexError('Location ({}, {}) lies outside the image.'.format(x, y))
        index = self.indexImage[row, col]
        if index < 0:
            # -1 marks an empty pixel; passing it on would fetch the last spectrum.
            raise ValueError('No spectrum is stored at location ({}, {}).'.format(x, y))
        return self.imzML.getspectrum(index)

    def getTICImage(self):
        """Return a (height, width) image of the summed counts of every spectrum."""
        ticImage = np.zeros((self.height, self.width))
        for index, x, y in self.coordinates:
            mzs, counts = self.imzML.getspectrum(index)
            ticImage[self._pixelPosition(x, y)] = np.sum(counts)
        return ticImage

    def determineMinMaxMZ(self, pixelsToSample=100):
        """Estimate the recorded m/z range from randomly sampled spectra.

        The first and last m/z value of each sampled spectrum are used, i.e.
        spectra are assumed to be sorted by m/z.

        :param pixelsToSample: number of random draws (with replacement).
        :return: ``(minMZ, maxMZ)``; ``(-1, 0)`` if nothing could be sampled.
        """
        # TODO: Check in the metadata
        minMZ = -1
        maxMZ = 0
        if not self.coordinates:
            return minMZ, maxMZ
        for _ in range(pixelsToSample):
            spectrumToSample = random.randint(0, len(self.coordinates) - 1)
            (index, x, y) = self.coordinates[spectrumToSample]
            mzs, counts = self.imzML.getspectrum(index)
            if len(mzs) == 0:
                continue
            if minMZ == -1 or mzs[0] < minMZ:
                minMZ = mzs[0]
            if maxMZ < mzs[-1]:
                maxMZ = mzs[-1]
        return minMZ, maxMZ

    def estimatePPM(self, minMZ, maxMZ, numBins=10, pixelsToSample=100):
        """Estimate the smallest m/z spacing (in ppm) in ``numBins`` equal m/z ranges.

        :param minMZ: lower end of the m/z range to split into bins.
        :param maxMZ: upper end of the m/z range.
        :param numBins: number of equally wide m/z bins.
        :param pixelsToSample: number of random spectra to inspect.
        :return: array of length ``numBins``; bins in which no spacing was
            observed keep the initial value 1e5.
        """
        ppmEstimates = np.ones(numBins) * 1e5
        if not self.coordinates:
            return ppmEstimates
        binWidth = (maxMZ - minMZ) / numBins
        for _ in range(pixelsToSample):
            spectrumToSample = random.randint(0, len(self.coordinates) - 1)
            (index, x, y) = self.coordinates[spectrumToSample]
            mzs, counts = self.imzML.getspectrum(index)
            lowerMZs = mzs[0:len(mzs) - 1]
            # Spacing to the next data point, relative to the lower point.
            ppms = (mzs[1:len(mzs)] - lowerMZs) * 1e6 / lowerMZs
            for binNum in range(numBins):
                startMZ = minMZ + (binNum * binWidth)
                endMZ = minMZ + ((binNum + 1) * binWidth)
                possiblePPMs = ppms[np.logical_and(lowerMZs >= startMZ, lowerMZs < endMZ)]
                if len(possiblePPMs) > 0:
                    ppmEstimates[binNum] = min(ppmEstimates[binNum], np.min(possiblePPMs))
        return ppmEstimates

    def generateMeanSpectrum(self, startmz, endmz, ppm):
        """Accumulate spectra onto a constant-ppm axis and store the result.

        Sets ``self.mzAxis`` (see ``generateMZAxis``) and ``self.meanSpectrum``.
        Each data point is added to the axis position nearest in log space.
        Processing of a spectrum stops at its first point beyond the end of
        the axis, so spectra are expected to be sorted by m/z.

        NOTE(review): only spectra whose parser index is a multiple of 10 are
        accumulated, yet the sum is divided by the number of all retained
        coordinates - confirm whether this scaling is intended.

        :return: array with ``len(self.mzAxis) - 1`` elements.
        """
        self.mzAxis = ImzMLHandler.generateMZAxis(startmz, endmz, ppm)
        spectrum = np.zeros((self.mzAxis.shape[0] - 1))
        startLog = np.log(self.mzAxis[0])
        ppmLog = np.log(1 + ppm * 1e-6)
        for index, x, y in self.coordinates:
            if index % 10 != 0:
                continue
            mzs, counts = self.imzML.getspectrum(index)
            counts = np.asarray(counts)
            locations = np.round((np.log(mzs) - startLog) / ppmLog)
            # Stop at the first point past the end of the axis
            beyond = np.nonzero(locations >= len(spectrum))[0]
            if beyond.size:
                locations = locations[:beyond[0]]
                counts = counts[:beyond[0]]
            # Points below the axis start (or with an undefined log) are skipped
            keep = locations >= 0
            # add.at accumulates repeated locations instead of overwriting them
            np.add.at(spectrum, locations[keep].astype(int), counts[keep])
        self.meanSpectrum = spectrum / len(self.coordinates)
        return self.meanSpectrum

    def generateIonImage(self, mz, ppm):
        """Return a (height, width) image of the counts within ``mz`` +/- ``ppm``.

        Both ends of the m/z window are inclusive.
        """
        ionImage = np.zeros((self.height, self.width))
        deltamz = ppm * 1e-6 * mz
        minmz = mz - deltamz
        maxmz = mz + deltamz
        for index, x, y in self.coordinates:
            mzs, counts = self.imzML.getspectrum(index)
            mzs = np.asarray(mzs)
            inWindow = np.logical_and(mzs >= minmz, mzs <= maxmz)
            ionImage[self._pixelPosition(x, y)] += np.sum(np.asarray(counts)[inWindow])
        return ionImage

    def generateIonImages(self, mzsToGenerate, ppm):
        """Return a (height, width, len(mzsToGenerate)) stack of ion images.

        Unlike ``generateIonImage`` the lower end of each window is exclusive.
        """
        mzsToGenerate = np.array(mzsToGenerate)
        ionImages = np.zeros((self.height, self.width, len(mzsToGenerate)))
        deltamz = ppm * 1e-6 * mzsToGenerate
        minmz = mzsToGenerate - deltamz
        maxmz = mzsToGenerate + deltamz
        for index, x, y in self.coordinates:
            mzs, counts = self.imzML.getspectrum(index)
            row, col = self._pixelPosition(x, y)
            for l in range(len(mzsToGenerate)):
                ionImages[row, col, l] = np.sum(
                    counts[np.logical_and(mzs > minmz[l], mzs <= maxmz[l])])
        return ionImages

    def generateDatacubeMZs(self, limits, ticNorm=False):
        """Sum the counts inside fixed m/z windows for every spectrum.

        :param limits: sequence of ``(lower, upper]`` m/z pairs.
        :param ticNorm: divide every spectrum by its total counts first.
        :return: array of shape (number of spectra, number of windows); also
            stored as ``self.datacube``.
        """
        limits = np.asarray(limits)
        datacube = np.zeros((len(self.coordinates), len(limits)))
        for spectrumIndex, (index, x, y) in enumerate(self.coordinates):
            mzs, counts = self.imzML.getspectrum(index)
            # Normalised to TIC
            if ticNorm:
                counts = self._normaliseToTIC(counts)
            for l in range(len(limits)):
                datacube[spectrumIndex, l] = np.sum(
                    counts[np.logical_and(mzs > limits[l, 0], mzs <= limits[l, 1])])
        self.datacube = datacube
        return self.datacube

    def generateDatacube(self, peaks, left_ips, right_ips, ticNorm=False):
        """Sum the counts belonging to each detected peak for every spectrum.

        Requires ``self.mzAxis``, which ``generateMeanSpectrum`` sets.

        :param peaks: detected peaks; only the number of peaks is used.
        :param left_ips: (fractional) positions on ``self.mzAxis`` of the
            left edge of every peak.
        :param right_ips: (fractional) positions of the right edges.
        :param ticNorm: divide every spectrum by its total counts first.
        :return: array of shape (number of spectra, number of peaks); also
            stored as ``self.datacube``.
        """
        # Pad every peak by one axis element on both sides, staying on the
        # axis: -1 would wrap to the last element and len(mzAxis) would raise.
        lastAxisIndex = len(self.mzAxis) - 1
        left_ips = np.clip(np.floor(left_ips).astype(int) - 1, 0, lastAxisIndex)
        right_ips = np.clip(np.ceil(right_ips).astype(int) + 1, 0, lastAxisIndex)
        leftMZs = self.mzAxis[left_ips]
        rightMZs = self.mzAxis[right_ips]
        numPeaks = len(peaks)

        datacube = np.zeros((len(self.coordinates), numPeaks))
        for spectrumIndex, (index, x, y) in enumerate(self.coordinates):
            mzs, counts = self.imzML.getspectrum(index)
            # Normalised to TIC
            if ticNorm:
                counts = self._normaliseToTIC(counts)
            curPeakIndex = 0
            for mzIndex in range(len(mzs)):
                mz = mzs[mzIndex]
                # Drop peaks that end below the current m/z
                while curPeakIndex < numPeaks and mz > rightMZs[curPeakIndex]:
                    curPeakIndex += 1
                if curPeakIndex >= numPeaks:
                    break
                # Add the point to the first remaining peak that contains it
                for peakIndex in range(curPeakIndex, numPeaks):
                    if mz < leftMZs[peakIndex]:
                        break
                    if mz >= leftMZs[peakIndex] and mz <= rightMZs[peakIndex]:
                        datacube[spectrumIndex, peakIndex] += counts[mzIndex]
                        break
        self.datacube = datacube
        return self.datacube

    def determineCorrelatedFeatures(self, clusteringThreshold):
        """Merge correlated datacube columns into averaged features.

        The columns of ``self.datacube`` are clustered on their correlation
        matrix (NaN entries are treated as 0).  ``self.uniqueDataMembers``
        receives the column indices of every cluster and ``self.uniqueData``
        one averaged column per cluster.

        :param clusteringThreshold: ``distance_threshold`` passed to
            ``AgglomerativeClustering``.
        :return: ``self.uniqueData``.
        """
        ionCorrelationMatrix = np.corrcoef(self.datacube.transpose())
        ionCorrelationMatrix[np.isnan(ionCorrelationMatrix)] = 0
        clustering = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=clusteringThreshold).fit(ionCorrelationMatrix)

        self.uniqueDataMembers = []
        averagedColumns = []
        for clusterIndex in np.unique(clustering.labels_):
            clusterMembers = np.where(clustering.labels_ == clusterIndex)[0]
            self.uniqueDataMembers.append(clusterMembers)
            averagedColumns.append(np.mean(self.datacube[:, clusterMembers], axis=1))
        # Stack once at the end; this also avoids comparing an array with [].
        self.uniqueData = np.column_stack(averagedColumns) if averagedColumns else []
        return self.uniqueData

    @staticmethod
    def generateMZAxis(startmz, endmz, ppm):
        """Return a geometric m/z axis ``startmz * (1 + ppm * 1e-6) ** i``.

        The axis holds ``ceil(log(endmz / startmz) / log(1 + ppm * 1e-6))``
        elements, so neighbouring values are ``ppm`` parts per million apart.
        """
        numElements = int(
            np.ceil(
                (np.log(endmz) - np.log(startmz)) / np.log(1 + ppm * 1e-6)))
        return startmz * np.power(1 + ppm * 1e-6, np.arange(numElements))
order = mzs.argsort() return mzs[order], intensities[order] def saveStatistics(self, filename): def toRect(d): xs = [k[0] for k in d] ys = [k[1] for k in d] img = np.zeros((max(xs) + 1, max(ys) + 1)) for k in d: img[k[0], k[1]] = d[k] return img with open(filename, "w+") as f: np.savez(f, real=toRect(self._norm_real), simulated=toRect(self._norm_simulated), groundtruth=toRect(self._norm_groundtruth), noise=toRect(self._norm_noise), diff=toRect(self._norm_diff)) ng = NoiseGenerator(args.nmf, args.layers, args.real) imzml_sim = ImzMLParser(args.simclean) with ImzMLWriter(args.output, mz_dtype=np.float32) as w: for i, coords in enumerate(imzml_sim.coordinates): noisy_mzs, noisy_intensities = ng.addNoise(imzml_sim.getspectrum(i), coords) w.addSpectrum(noisy_mzs, noisy_intensities, coords) ng.saveStatistics(args.output + ".norms")
class NoiseGenerator(object):
    """Adds noise derived from a real dataset to simulated spectra.

    The noise at a pixel is the absolute difference between the binned real
    spectrum and the factorisation ``W[x, y, :].dot(H)`` loaded from the NMF
    file.  Norms of the real, simulated, ground-truth and noise signals are
    recorded per pixel and can be written out with ``saveStatistics``.
    """

    def __init__(self, nmf_fn, layers_fn, imzml_fn):
        """Load the real dataset and its factorisation.

        :param nmf_fn: .npz file providing ``shape``, ``W``, ``H`` and
            ``mz_axis`` (pairs of m/z and ppm).
        :param layers_fn: pickle file with layer assignments; currently
            unused because ``_removeAssignedBins`` is disabled.
        :param imzml_fn: imzML file of the real dataset.
        """
        self._imzml = ImzMLParser(imzml_fn)
        with np.load(nmf_fn) as data:
            nx, ny = data['shape']
            self._W = data['W'].reshape((nx, ny, -1))
            self._H = data['H']
            self._mz_axis = data['mz_axis']
        self._norm_real = {}
        self._norm_simulated = {}
        self._norm_groundtruth = {}
        self._norm_noise = {}
        self._norm_diff = {}
        # (x, y) -> position of the spectrum in the real dataset
        self._coords = {(coords[0], coords[1]): i
                        for i, coords in enumerate(self._imzml.coordinates)}
        # Upper edge of every m/z bin
        self._mz_bins = [mz * (1.0 + 1e-6 * ppm) for mz, ppm in self._mz_axis]

        # self._removeAssignedBins(layers_fn)

    def _removeAssignedBins(self, layers_fn):
        """Keep in every row of ``H`` only the bins assigned to that component."""
        # buggy at the moment
        try:
            import cPickle as pickle  # Python 2
        except ImportError:
            import pickle
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        with open(layers_fn, 'rb') as f:
            layers = pickle.load(f)
        for i in layers['layers_list']:
            assigned = layers['layers_list'][i]['assigned_mz_bins']
            assigned = assigned[assigned < self._H[i].shape[0]]
            print("#assigned bins in component #{}: {}".format(i + 1, len(assigned)))
            h = np.zeros_like(self._H[i])
            h[assigned] = self._H[i][assigned]
            self._H[i] = h

    def _getRealSpectrum(self, x, y):
        """Return ``(mzs, intensities)`` of the real spectrum at pixel (x, y)."""
        return self._imzml.getspectrum(self._coords[(x, y)])

    def _norm(self, intensities):
        """Return the Euclidean norm of ``intensities``."""
        return np.linalg.norm(intensities)

    def generateNoise(self, x, y, inflate_noise=None):
        """Return ``(mzs, intensities)`` of the noise peaks for pixel (x, y).

        Also records the norm of the binned real spectrum in
        ``self._norm_real[(x, y)]``.

        :param inflate_noise: factor applied to the noise intensities;
            defaults to the module-level ``args.inflate_noise``.
        """
        if inflate_noise is None:
            inflate_noise = args.inflate_noise
        real_mzs, real_intensities = (np.array(a) for a in self._getRealSpectrum(x, y))

        # Only peaks inside the binned m/z range can be compared
        min_mz, max_mz = self._mz_bins[0], self._mz_bins[-1]
        inside_range = (real_mzs >= min_mz) & (real_mzs <= max_mz)
        real_mzs = real_mzs[inside_range]
        real_intensities = real_intensities[inside_range]

        # NOTE(review): a peak exactly at the last bin edge is given bin
        # number len(self._mz_bins), which lengthens the bincount result by
        # one element - confirm this cannot happen with the data in use.
        bins = np.digitize(real_mzs, self._mz_bins)
        n_bins = len(self._mz_bins)
        binned_real_intensities = np.bincount(bins, weights=real_intensities,
                                              minlength=n_bins)
        self._norm_real[(x, y)] = self._norm(binned_real_intensities)

        noise_mzs = np.array(real_mzs)
        if real_intensities.size == 0:
            # No real peaks in range: nothing to derive noise from (and min()
            # below would fail on an empty sequence).
            return noise_mzs, np.array(real_intensities, dtype=float)

        binned_approx_intensities = self._W[x, y, :].dot(self._H)
        noise = np.abs(binned_real_intensities - binned_approx_intensities)

        # FIXME: avoid duplicating noise
        noise_intensities = noise[bins] * inflate_noise
        # Drop noise below half of the weakest real peak
        nnz = noise_intensities > min(real_intensities) / 2
        return noise_mzs[nnz], noise_intensities[nnz]

    def addNoise(self, profile_spectrum, coords):
        """Centroid a simulated profile spectrum and merge it with noise.

        Peaks at or below the smallest intensity of the real spectrum at
        that pixel are discarded.  The norms of the noise and ground-truth
        peaks, their sum and its distance to the real norm are recorded.

        :param profile_spectrum: ``(mzs, intensities)`` in profile mode.
        :param coords: pixel coordinates; only the first two entries (x, y)
            are used, so ``(x, y)`` and ``(x, y, z)`` are both accepted.
        :return: ``(mzs, intensities)`` sorted by m/z.
        """
        profile_mzs, profile_intensities = (np.array(a) for a in profile_spectrum)
        p = centroidize(profile_mzs, profile_intensities)
        mzs = np.array(p.masses)
        mult = profile_intensities.max() if len(profile_intensities) > 0 else 1
        intensities = np.array(p.abundances) * mult

        x, y = coords[:2]
        limit = min(self._getRealSpectrum(x, y)[1])

        noise_mzs, noise_intensities = self.generateNoise(x, y)
        self._norm_noise[(x, y)] = self._norm(noise_intensities[noise_intensities > limit])
        self._norm_groundtruth[(x, y)] = self._norm(intensities[intensities > limit])
        self._norm_simulated[(x, y)] = self._norm_noise[(x, y)] + self._norm_groundtruth[(x, y)]
        self._norm_diff[(x, y)] = abs(self._norm_simulated[(x, y)] - self._norm_real[(x, y)])

        mzs = np.concatenate([mzs, noise_mzs])
        intensities = np.concatenate([intensities, noise_intensities])
        detectable = np.where(intensities > limit)[0]
        mzs = mzs[detectable]
        intensities = intensities[detectable]
        order = mzs.argsort()
        return mzs[order], intensities[order]

    def saveStatistics(self, filename):
        """Write the collected per-pixel norms to ``filename`` in .npz format.

        Every norm dictionary is turned into a dense image indexed
        ``[x, y]``; pixels without a value stay 0.
        """
        def toRect(d):
            xs = [k[0] for k in d]
            ys = [k[1] for k in d]
            img = np.zeros((max(xs) + 1, max(ys) + 1))
            for k in d:
                img[k[0], k[1]] = d[k]
            return img

        # np.savez writes a binary zip archive, so the file has to be opened
        # in binary mode.
        with open(filename, "wb") as f:
            np.savez(f,
                     real=toRect(self._norm_real),
                     simulated=toRect(self._norm_simulated),
                     groundtruth=toRect(self._norm_groundtruth),
                     noise=toRect(self._norm_noise),
                     diff=toRect(self._norm_diff))