Exemplo n.º 1
0
def imzml_to_sbd(filepath_imzml, filepath_sbd):
    """Convert a pair of .imzML and .ibd files into a single .sbd file.

    The output is written little-endian in three parts:

    * a header packed as ``<BQB`` holding 0, the number of spectra and 8;
    * one ``<QLfHH`` record per spectrum holding the byte offset of its
      data, its number of points, its summed intensity, x and y;
    * the spectra themselves, each written by ``write_spectrum``.

    Args:
        filepath_imzml: Path of the .imzML file to read.
        filepath_sbd: Path of the .sbd file to create (overwritten).

    Returns:
        bool: True on success.
    """
    header_fmt = '<BQB'
    record_fmt = '<QLfHH'
    # NOTE(review): the offsets assume write_spectrum emits 12 bytes per
    # data point -- confirm against write_spectrum.
    bytes_per_point = 12

    # Parse the input before touching the output, so an unreadable imzML
    # file does not leave a truncated .sbd file behind.
    p = ImzMLParser(filepath_imzml)
    n_spectra = len(p.coordinates)

    # First pass: compute where each spectrum's data will start.
    meta = []
    offset = (struct.calcsize(header_fmt)
              + struct.calcsize(record_fmt) * n_spectra)
    for idx, (x, y, _z) in enumerate(p.coordinates):
        mzs, intensities = p.getspectrum(idx)
        n_points = len(mzs)
        meta.append((offset, n_points, np.sum(intensities), x, y))
        offset += n_points * bytes_per_point

    with open(filepath_sbd, 'wb') as out_file:
        out_file.write(struct.pack(header_fmt, 0, n_spectra, 8))

        for record in meta:
            out_file.write(struct.pack(record_fmt, *record))

        # Second pass: spectra are read again rather than kept from the
        # first pass.
        for i in range(n_spectra):
            mzs, intensities = p.getspectrum(i)
            write_spectrum(out_file, (mzs, intensities))

    return True
Exemplo n.º 2
0
class IMSDataset:
    """Wrapper around an imzML file offering tabular and array exports.

    Attributes are set in ``__init__``:
      parser           -- ``ImzMLParser`` opened on the given path
      micro_res        -- value passed as ``micro_res``
      IMS_res          -- value passed as ``IMS_res``
      IMS_px_in_micro  -- ``IMS_res / micro_res``
    """

    def __init__(self, fpath, micro_res=0.5, IMS_res=10):
        """Open ``fpath`` with ``ImzMLParser`` and store the resolutions."""
        self.parser = ImzMLParser(fpath)
        self.micro_res = micro_res
        self.IMS_res = IMS_res
        self.IMS_px_in_micro = IMS_res / micro_res

    def __get_min_max_coords(self):
        """Return ``(x_min, y_min, x_max, y_max)`` over all coordinates."""
        coords = np.array(self.parser.coordinates)
        x_min, y_min, _ = np.min(coords, axis=0)
        x_max, y_max, _ = np.max(coords, axis=0)
        return x_min, y_min, x_max, y_max

    def to_columnar(self, mz_precision=4, dtype="uint32"):
        """Return one DataFrame row per spectrum.

        The first columns are ``x``, ``y``, ``micro_x_topleft``,
        ``micro_y_topleft`` and ``micro_px_width``; they are followed by one
        intensity column per m/z value of spectrum 0, named after the m/z
        rounded to ``mz_precision`` decimals. Every spectrum must therefore
        have as many intensities as spectrum 0 has m/z values.

        Args:
            mz_precision: Number of decimals kept in the m/z column names.
            dtype: dtype passed to both DataFrame constructors.
        """
        mzs, _ = self.parser.getspectrum(0)
        # Fixed: this used to read the undefined global name ``dataset``.
        coords = np.array(self.parser.coordinates)
        x, y, _ = coords.T

        coords_df = pd.DataFrame(
            {
                "x": x,
                "y": y,
                # (coordinate - 1) * pixel size; presumably coordinates are
                # 1-based -- confirm.
                "micro_x_topleft": x * self.IMS_px_in_micro - self.IMS_px_in_micro,
                "micro_y_topleft": y * self.IMS_px_in_micro - self.IMS_px_in_micro,
                "micro_px_width": np.repeat(self.IMS_px_in_micro, len(coords)),
            },
            dtype=dtype,
        )

        intensities = np.zeros((len(coords_df), len(mzs)))
        for i in range(len(coords)):
            _, coord_intensities = self.parser.getspectrum(i)
            intensities[i, :] = coord_intensities

        intensities = pd.DataFrame(
            intensities, columns=np.round(mzs, mz_precision).astype(str), dtype=dtype
        )

        return coords_df.join(intensities)

    def to_array(self):
        """Return the intensities as an array indexed ``[x, y, mz_index]``.

        x and y are shifted so the smallest coordinate maps to index 0;
        positions without a spectrum stay zero.

        Raises:
            ValueError: if the spectra do not all have the same number of
                m/z values.
        """
        x_min, y_min, x_max, y_max = self.__get_min_max_coords()
        mz_lengths = self.parser.mzLengths
        if not (mz_lengths.count(mz_lengths[0]) == len(mz_lengths)):
            raise ValueError("The number of m/z is not the same at each coordinate.")

        arr = np.zeros((x_max - x_min + 1, y_max - y_min + 1, mz_lengths[0]))

        for idx, (x, y, _) in enumerate(self.parser.coordinates):
            _, intensities = self.parser.getspectrum(idx)
            arr[x - x_min, y - y_min, :] = intensities

        return arr

    def write_zarr(self, path, dtype="i4"):
        """Write ``to_array()`` to an uncompressed zarr array at ``path``."""
        arr = self.to_array()
        z_arr = zarr.open(path, mode="w", shape=arr.shape, compressor=None, dtype=dtype)
        z_arr[:, :, :] = arr
def get_ds_spots(ds_id):
    """Merge the spectra that fall under each named mask of a dataset.

    Reads ``raw_datasets/{ds_id}.imzML``, ``spotting/grids/{ds_id}.npy`` and
    ``spotting/grids/{ds_id}_mask_names.json``. Mask name number ``i`` is
    matched against the value ``i`` in the grid; the name ``'background'``
    is skipped.

    Returns:
        dict: mask name -> ``(mzs, ints, n_spectra)`` where ``mzs, ints`` is
        the result of ``merge_spectra`` on the spectra after each one has
        been scaled to a summed intensity of 1e6.
    """
    parser = ImzMLParser(f'raw_datasets/{ds_id}.imzML')
    grid_mask = np.load(f'spotting/grids/{ds_id}.npy')
    # Use a context manager so the file handle is closed deterministically.
    with open(f'spotting/grids/{ds_id}_mask_names.json') as names_file:
        mask_names = json.load(names_file)

    # Make a mapping of coordinate -> spectrum index (-1 = no spectrum)
    coords = np.array(parser.coordinates)[:, :2]
    base_coord = np.min(coords, axis=0)
    coord_to_idx = np.ones(np.max(coords, axis=0) - base_coord + 1,
                           dtype='i') * -1
    for i, (x, y) in enumerate(coords):
        coord_to_idx[x - base_coord[0], y - base_coord[1]] = i

    # Collect spectra for each mask item
    spots = {}
    for i, mask_name in enumerate(mask_names):
        if mask_name == 'background':
            continue
        # np.nonzero yields (row, column) indices, i.e. (y, x).
        spectra_ys, spectra_xs = np.nonzero(grid_mask == i)
        # NOTE(review): a mask pixel without a spectrum maps to -1, which is
        # passed to getspectrum unchanged -- confirm masks never cover
        # pixels that have no spectrum.
        spectra = [
            parser.getspectrum(idx)
            for idx in coord_to_idx[spectra_xs, spectra_ys]
        ]
        norm_spectra = [(mzs, ints * 1e6 / np.sum(ints))
                        for mzs, ints in spectra]
        mzs, ints = merge_spectra(norm_spectra)
        spots[mask_name] = mzs, ints, len(norm_spectra)
    return spots
def load_imzml_data_set(file):
    """
    Load every spectrum of an imzML file into two data frames.

    :param file: file name, joined onto the module-level ``data_path_imzml``
    :return: dict with the keys ``"mass"`` and ``"intensity"`` (data frames
             with one column per spectrum index), ``"x"`` and ``"y"`` (x and
             y of the last entry of the parser's coordinate list) and
             ``"f_name"`` (``file`` up to its first dot)
    """
    parser = ImzMLParser(os.path.join(data_path_imzml, file))
    last_coordinate = parser.coordinates[-1]
    x_cord = last_coordinate[0]
    y_cord = last_coordinate[1]

    mass_by_index = {}
    intensity_by_index = {}
    for index, (_x, _y, _z) in enumerate(parser.coordinates):
        # m/z values and the matching ion abundances of this spectrum
        spectrum_mzs, spectrum_intensities = parser.getspectrum(index)
        mass_by_index[index] = spectrum_mzs
        intensity_by_index[index] = spectrum_intensities

    result = {
        "mass": pd.DataFrame(mass_by_index),
        "intensity": pd.DataFrame(intensity_by_index),
        "x": x_cord,
        "y": y_cord,
        "f_name": file.split('.')[0],
    }
    return result
def write_corrected_msi(msi, output_file, tolerance, database_exactmass, step,
                        dalim):
    """Write a copy of ``msi`` whose m/z axes have been corrected.

    Each pixel is processed independently. A pixel is written to
    ``output_file`` only when all of these hold, otherwise it is left out:
    more than 30 selected peaks, more than 10 hit errors, a region of
    interest summing to more than 10, and a truthy error model.
    """
    with ImzMLWriter(output_file) as writer:
        parser = ImzMLParser(msi, parse_lib='ElementTree')
        # iterate through each pixel of the MSI
        for index, (x, y, z) in enumerate(parser.coordinates):
            pixel_mzs, pixel_intensities = parser.getspectrum(index)
            selected = peak_selection(pixel_intensities)
            selected_mzs = pixel_mzs[selected]
            if not len(selected_mzs) > 30:
                continue

            hit_exp, hit_errors = hits_generation(selected_mzs,
                                                  database_exactmass,
                                                  tolerance)
            if not len(hit_errors) > 10:
                continue

            roi = hits_selection(hit_errors, step, tolerance, da_limit=dalim)
            if not np.sum(roi) > 10:
                continue

            error_model = create_lm(hit_exp,
                                    hit_errors,
                                    tolerance=tolerance,
                                    da_limit=dalim,
                                    step=step)
            if not error_model:
                continue

            writer.addSpectrum(correct_mz_lm(pixel_mzs, error_model),
                               pixel_intensities, (x, y, z))
Exemplo n.º 6
0
    def run(self):
        """Compute per-spectrum statistics and dump them as JSON.

        For every spectrum of ``self.imzml_filename`` the number of peaks and
        the minimum, maximum, peak-to-peak range and 5/25/50/75/95th
        percentiles of the intensities are collected. The result is written
        to ``self.output().path`` as a JSON object of parallel lists.
        """
        from pyimzml.ImzMLParser import ImzMLParser
        import json
        pcts = [5, 25, 50, 75, 95]
        n_peaks = []
        s_min = []
        s_max = []
        s_ptp = []
        s_pcts = []
        p = ImzMLParser(self.imzml_filename)
        for i, _coord in enumerate(p.coordinates):
            mzs, ints = p.getspectrum(i)
            n_peaks.append(len(mzs))
            # .item()/.tolist() convert numpy scalars to plain Python
            # numbers; json cannot serialize e.g. numpy.float32.
            s_min.append(np.min(ints).item())
            s_max.append(np.max(ints).item())
            s_ptp.append(np.ptp(ints).item())
            s_pcts.append(np.percentile(ints, pcts).tolist())

        stats = {
            'n_peaks': n_peaks,
            's_min': s_min,
            's_max': s_max,
            's_ptp': s_ptp,
            's_pcts': s_pcts
        }
        with open(self.output().path, 'w+') as f:
            json.dump(stats, f)
        # Call form prints the same text on Python 2 and Python 3.
        print('wrote spec stats')
Exemplo n.º 7
0
class FSImzMLReader(ImzMLReader):
    """``ImzMLReader`` backed by an imzML file located under a path."""

    def __init__(self, path: Path):
        """Locate the 'imzml' file via ``find_file_by_ext`` and parse it.

        Raises:
            ImzMLError: if ``ImzMLParser`` raises; the message is the
                formatted traceback and the original exception is chained.
        """
        self.filename = find_file_by_ext(path, 'imzml')
        parser_options = {
            'parse_lib': 'ElementTree',
            'include_spectra_metadata': METADATA_FIELDS,
        }
        try:
            self._imzml_parser = ImzMLParser(self.filename, **parser_options)
        except Exception as e:
            raise ImzMLError(format_exc()) from e

        super().__init__(self._imzml_parser)

    def iter_spectra(self, sp_idxs: Sequence[int]):
        """Yield ``(sp_idx, mzs, ints)`` for each requested spectrum index.

        Each spectrum is checked against the lengths recorded by the parser
        and then passed through ``self._process_spectrum`` before being
        yielded.
        """
        for sp_idx in sp_idxs:
            mzs, ints = self._imzml_parser.getspectrum(sp_idx)

            expected_mzs = self._imzml_parser.mzLengths[sp_idx]
            assert len(mzs) == expected_mzs, 'Incomplete .ibd file'
            expected_ints = self._imzml_parser.intensityLengths[sp_idx]
            assert len(ints) == expected_ints, 'Incomplete .ibd file'
            assert len(mzs) == len(ints), \
                f"Spectrum {sp_idx} mz and intensity counts don't match"

            processed = self._process_spectrum(sp_idx, mzs, ints)
            sp_idx, mzs, ints = processed
            yield sp_idx, mzs, ints
Exemplo n.º 8
0
    def __read_all(self, filename):
        """
        Internal helper function used to read all data into memory.

        ``self.data`` is replaced by a zero array of shape ``self.shape`` and
        dtype ``self.data_type`` and then filled spectrum by spectrum; nothing
        is returned. In 'processed' mode each spectrum is re-binned onto
        ``self.mz`` with an intensity-weighted histogram.

        :param filename: path of the imzML file to read
        """

        self.data = np.zeros(shape=self.shape, dtype=self.data_type)
        log_helper.info(__name__, 'Datacube shape is %s' % [self.data.shape])
        reader = ImzMLParser(filename)
        log_helper.debug(__name__, 'READING ALL DATA!! GIVE ME RAM (please)!')

        # Compute the bin edges for reinterpolation if needed
        if self.imzml_type == self.available_imzml_types['processed']:
            shift = np.diff(self.mz).mean()
            bin_edges = np.append(self.mz, self.mz[-1] + shift)
        else:
            bin_edges = None
        # enumerate() replaces the Python-2-only xrange() index loop.
        for ind, coord in enumerate(reader.coordinates):
            # Only x and y are needed; slicing accepts both (x, y) and
            # (x, y, z) coordinate tuples.
            xidx, yidx = coord[:2]
            # Coordinates may start at arbitrary locations, hence, we need to subtract the minimum to recenter at (0,0)
            xidx -= self.x_pos_min
            yidx -= self.y_pos_min
            # Read the spectrum
            mz, intens = reader.getspectrum(ind)
            # Reinterpolate intensities if we are in processed mode
            if bin_edges is not None:
                intens, _ = np.histogram(mz, bins=bin_edges, weights=intens)
            # Save the intensity values in our data cube
            self.data[xidx, yidx, :] = intens
Exemplo n.º 9
0
    def __read_all(self, filename):
        """
        Internal helper function used to read all data into memory.

        ``self.data`` is replaced by a zero array of shape ``self.shape`` and
        dtype ``self.data_type`` and then filled spectrum by spectrum; nothing
        is returned. In 'processed' mode each spectrum is linearly
        interpolated onto ``self.mz``, with 0 outside the spectrum's own m/z
        range.

        :param filename: path of the imzML file to read
        """

        self.data = np.zeros(shape=self.shape, dtype=self.data_type)
        log_helper.info(__name__, 'Datacube shape is %s' % [self.data.shape])
        reader = ImzMLParser(filename)
        log_helper.debug(__name__,'READING ALL DATA!! GIVE ME RAM (please)!')

        # Interpolation is only needed in processed mode. The bin edges the
        # earlier histogram approach required are no longer computed.
        reinterpolate = self.imzml_type == self.available_imzml_types['processed']
        # enumerate() replaces the Python-2-only xrange() index loop.
        for ind, coord in enumerate(reader.coordinates):
            # Only x and y are needed; slicing accepts both (x, y) and
            # (x, y, z) coordinate tuples.
            xidx, yidx = coord[:2]
            # Coordinates may start at arbitrary locations, hence, we need to subtract the minimum to recenter at (0,0)
            xidx -= self.x_pos_min
            yidx -= self.y_pos_min
            # Read the spectrum
            mz, intens = reader.getspectrum(ind)
            # Reinterpolate intensities if we are in processed mode
            if reinterpolate:
                f = interpolate.interp1d(mz,intens,fill_value=0,bounds_error=False)
                intens = f(self.mz)
            # Save the intensity values in our data cube
            self.data[xidx, yidx, :] = intens
Exemplo n.º 10
0
    def spectrum_iter(self):
        """
        Generator function that yields a position and associated spectrum for a selected datacube type.
        :yield: (xidx, yidx) a tuple of ints representing x and y position in the image
        :yield: yi,          a numpy 1D-array of floats containing spectral intensities at the given position
                                and for the selected datacube type
        """
        reader = ImzMLParser(self.basename)
        # enumerate() replaces the Python-2-only xrange() index loop.
        for idx, coord in enumerate(reader.coordinates):
            # Only x and y are used; any z component is ignored.
            xidx, yidx = coord[:2]
            # Coordinates may start at arbitrary locations, hence, we need to subtract the minimum to recenter at (0,0)
            xidx -= self.x_pos_min
            yidx -= self.y_pos_min
            mz, intens = reader.getspectrum(idx)
            # In processed mode, linearly interpolate the spectrum onto the
            # common axis self.mz (0 outside the spectrum's own m/z range)
            if self.imzml_type == self.available_imzml_types['processed']:
                f = interpolate.interp1d(mz,
                                         intens,
                                         fill_value=0,
                                         bounds_error=False)
                intens = f(self.mz)

            yield (xidx, yidx), np.asarray(intens)
Exemplo n.º 11
0
def get_spec(x, y1, y2, imzML_file):
    """Collect the spectra of column ``x`` for ``y`` in ``range(y1, y2)``.

    Only coordinates of the form ``(x, y, 1)`` are looked up; a missing
    coordinate is reported on stdout and skipped.

    Returns:
        dict: spectrum index -> array of the values of
        ``tupel2map(parser.getspectrum(index))``.
    """
    parser = ImzMLParser(imzML_file)
    part_map = dict()
    for y in range(y1, y2):
        # Only the lookup is guarded: the former bare ``except`` also hid
        # unrelated errors (and KeyboardInterrupt) behind this message.
        try:
            idx = parser.coordinates.index((x, y, 1))
        except ValueError:
            print(f"({x}, {y}, 1) is not in list.")
            continue
        spec_map = tupel2map(parser.getspectrum(idx))
        part_map[idx] = np.array(list(spec_map.values()))
    return part_map
Exemplo n.º 12
0
def main(argv):
    """Copy an imzML file as float32 and check the first spectrum round-trips.

    Options: ``-i/--ifile`` input file (required), ``-o/--ofile`` output file
    (defaults to the input name plus ``.imzML``), ``-h`` usage.

    Prints True when the first spectrum read back from the output has the
    same coordinates and numerically close m/z and intensity values as the
    first spectrum of the input, False otherwise.

    Raises:
        IOError: if no input file is given.
    """
    from pyimzml.ImzMLParser import ImzMLParser
    usage = 'test.py -i <inputfile> -o <outputfile>'
    inputfile = ''
    outputfile = ''
    try:
        opts, args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg
    if inputfile == '':
        print(usage)
        raise IOError('input file not specified')
    if outputfile == '':
        outputfile = inputfile + '.imzML'
    imzml = ImzMLParser(inputfile)
    spectra = []
    with ImzMLWriter(outputfile,
                     mz_dtype=np.float32,
                     intensity_dtype=np.float32) as writer:
        for i, coords in enumerate(imzml.coordinates):
            mzs, intensities = imzml.getspectrum(i)
            writer.addSpectrum(mzs, intensities, coords)
            spectra.append((mzs, intensities, coords))

    imzml = ImzMLParser(outputfile)
    spectra2 = []
    for i, coords in enumerate(imzml.coordinates):
        mzs, intensities = imzml.getspectrum(i)
        spectra2.append((mzs, intensities, coords))

    # Comparing the tuples with == raises ValueError because they hold numpy
    # arrays, so compare field by field. allclose is used because the data
    # was written as float32.
    mzs_in, ints_in, coords_in = spectra[0]
    mzs_out, ints_out, coords_out = spectra2[0]
    same = (coords_in == coords_out
            and len(mzs_in) == len(mzs_out)
            and len(ints_in) == len(ints_out)
            and np.allclose(mzs_in, mzs_out)
            and np.allclose(ints_in, ints_out))
    print(bool(same))
Exemplo n.º 13
0
def import_imzml_dataset(filepath):
    """Read an .imzml file into a list of spectrum objects.

    Returns:
        list: one ``spectrum(mzs, intensities, x, y, z)`` per coordinate, in
        the parser's coordinate order.
    """
    parser = ImzMLParser(filepath)

    def _load(index, position):
        # Build the spectrum object for one coordinate entry.
        x, y, z = position
        mzs, intensities = parser.getspectrum(index)
        return spectrum(mzs, intensities, x, y, z)

    return [_load(index, position)
            for index, position in enumerate(parser.coordinates)]
Exemplo n.º 14
0
def main(argv):
    """Copy an imzML file as float32 and check the first spectrum round-trips.

    Options: ``-i/--ifile`` input file (required), ``-o/--ofile`` output file
    (defaults to the input name plus ``.imzML``), ``-h`` usage.

    Prints True when the first spectrum read back from the output has the
    same coordinates and numerically close m/z and intensity values as the
    first spectrum of the input, False otherwise.

    Raises:
        IOError: if no input file is given.
    """
    from pyimzml.ImzMLParser import ImzMLParser
    usage = 'test.py -i <inputfile> -o <outputfile>'
    inputfile = ''
    outputfile = ''
    try:
        opts, args = getopt.getopt(argv,"hi:o:",["ifile=","ofile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg
    if inputfile == '':
        print(usage)
        raise IOError('input file not specified')
    if outputfile=='':
        outputfile=inputfile+'.imzML'
    imzml = ImzMLParser(inputfile)
    spectra = []
    with ImzMLWriter(outputfile, mz_dtype=np.float32, intensity_dtype=np.float32) as writer:
        for i, coords in enumerate(imzml.coordinates):
            mzs, intensities = imzml.getspectrum(i)
            writer.addSpectrum(mzs, intensities, coords)
            spectra.append((mzs, intensities, coords))

    imzml = ImzMLParser(outputfile)
    spectra2 = []
    for i, coords in enumerate(imzml.coordinates):
        mzs, intensities = imzml.getspectrum(i)
        spectra2.append((mzs, intensities, coords))

    # Comparing the tuples with == raises ValueError because they hold numpy
    # arrays, so compare field by field. allclose is used because the data
    # was written as float32.
    mzs_in, ints_in, coords_in = spectra[0]
    mzs_out, ints_out, coords_out = spectra2[0]
    same = (coords_in == coords_out
            and len(mzs_in) == len(mzs_out)
            and len(ints_in) == len(ints_out)
            and np.allclose(mzs_in, mzs_out)
            and np.allclose(ints_in, ints_out))
    print(bool(same))
Exemplo n.º 15
0
class ImzmlDataset(BaseDataset):
    """Dataset backed by an imzML file opened with pyimzml's ImzMLParser."""

    def __init__(self, filename):
        """Open ``filename`` and cache its coordinates as an array."""
        from pyimzml.ImzMLParser import ImzMLParser
        super(ImzmlDataset, self).__init__(filename)
        parser = ImzMLParser(filename)
        self.imzml = parser
        self.coordinates = np.asarray(parser.coordinates)
        # FIXME: get pixel size from header data
        self.step_size = [1, 1, 1]

    def get_spectrum(self, ix):
        """Return ``[mzs, counts]`` of spectrum ``ix`` as numpy arrays."""
        # TODO: return MassSpectrum
        spectrum_pair = self.imzml.getspectrum(ix)
        mzs, counts = spectrum_pair
        return list(map(np.asarray, (mzs, counts)))

    def get_image(self, mz, tol):
        """Return the parser's ``getionimage(mz, tol)`` result."""
        return self.imzml.getionimage(mz, tol)
def save_data_to_csv(filename):
    """Dump all spectra of an imzML file to two CSV files.

    ``filename`` is joined onto the module-level ``data_path``. The m/z
    values go to ``mass_data.csv`` and the intensities to
    ``intensities.csv`` in the current directory, one column per spectrum
    index.
    """
    parser = ImzMLParser(os.path.join(data_path, filename))
    masses_by_index = {}
    intensities_by_index = {}
    for index, (_x, _y, _z) in enumerate(parser.coordinates):
        # m/z values and the matching ion abundances of this spectrum
        masses_by_index[index], intensities_by_index[index] = \
            parser.getspectrum(index)

    mass_frame = pd.DataFrame(masses_by_index)
    intensity_frame = pd.DataFrame(intensities_by_index)
    for frame, target in ((mass_frame, 'mass_data.csv'),
                          (intensity_frame, 'intensities.csv')):
        frame.to_csv(target)
Exemplo n.º 17
0
def import_spectra(filepath, spectra_format="imzml"):
    """Import the spectra stored in ``filepath``.

    Args:
        filepath: Path of the file to read.
        spectra_format: ``"imzml"`` (any letter case). ``"brukerflex"`` and
            ``"xmass"`` are recognised but not implemented.

    Returns:
        list: one ``getspectrum(i)`` result per coordinate of the file.

    Raises:
        NotImplementedError: for the Bruker/Xmass formats.
        ValueError: for any other unrecognised format.
    """
    requested_format = spectra_format.lower()
    ############### IMZML
    if requested_format == "imzml":
        ##### Import the libraries
        install_required_packages("pyimzml")
        from pyimzml.ImzMLParser import ImzMLParser
        ##### Parse the imzML file
        parsed_imzml = ImzMLParser(filepath)
        ##### Generate the list of spectra
        # Iterate by index: unpacking each coordinate as (x, y) failed on
        # (x, y, z) tuples, and the coordinate values were never used.
        return [parsed_imzml.getspectrum(i)
                for i in range(len(parsed_imzml.coordinates))]
    ############### XMASS
    if requested_format in ("brukerflex", "xmass"):
        # Previously this fell through to the return statement and failed
        # with UnboundLocalError.
        raise NotImplementedError(
            "spectra format %r is not supported yet" % (spectra_format,))
    raise ValueError("unknown spectra format %r" % (spectra_format,))
def save_data_to_csv(filename, type):
    """Dump spectra and coordinates of an imzML file to CSV files.

    ``filename`` is joined onto the module-level ``data_path``. Three files
    are written under ``csvData/``, each suffixed with ``type``: the m/z
    values, the intensities (both one column per spectrum index) and the
    coordinates (one row per spectrum index with columns x, y, z).
    """
    parser = ImzMLParser(os.path.join(data_path, filename))
    masses_by_index = {}
    intensities_by_index = {}
    position_by_index = {}
    for index, (x, y, z) in enumerate(parser.coordinates):
        # m/z values and the matching ion abundances of this spectrum
        masses_by_index[index], intensities_by_index[index] = \
            parser.getspectrum(index)
        position_by_index[index] = {"x": x, "y": y, "z": z}

    outputs = (
        (pd.DataFrame(masses_by_index), 'csvData/mass_data_{type}.csv'),
        (pd.DataFrame(intensities_by_index), 'csvData/intensities_{type}.csv'),
        (pd.DataFrame.from_dict(position_by_index, orient="index"),
         'csvData/coords_{type}.csv'),
    )
    for frame, template in outputs:
        frame.to_csv(template.format(type=type))
Exemplo n.º 19
0
 def run(self):
     """Build one summary image per entry of ``self.im_types`` and dump it.

     Each entry names a numpy function (looked up with ``getattr(np, ...)``)
     that is applied to the intensities of every spectrum; the result is
     stored at ``[y - 1, x - 1]``. Image number ``k`` is written as JSON to
     ``self.output()[k].path`` with the keys ``im_vect`` (flattened values)
     and ``im_shape``.
     """
     from pyimzml.ImzMLParser import ImzMLParser
     import json
     p = ImzMLParser(self.imzml_filename)
     im = {
         im_type: np.zeros((p.imzmldict["max count of pixels y"],
                            p.imzmldict["max count of pixels x"]))
         for im_type in self.im_types
     }
     for index, (x, y, _z) in enumerate(p.coordinates):
         _mzs, ints = p.getspectrum(index)
         row, col = y - 1, x - 1
         for im_type in self.im_types:
             im[im_type][row, col] = getattr(np, im_type)(ints)
     for position, im_type in enumerate(self.im_types):
         image = im[im_type]
         result = {
             'im_vect': list(image.flatten()),
             'im_shape': np.shape(image)
         }
         with open(self.output()[position].path, 'w+') as f:
             json.dump(result, f)
Exemplo n.º 20
0
def get_spectra_df_from_parser(p: ImzMLParser, sp_idxs: Iterable[int]):
    """Build peak and per-spectrum data frames for the given spectra.

    Peaks whose intensity is not greater than 0 are dropped.

    Args:
        p: Parser providing ``getspectrum(i)`` and ``coordinates[i]``.
        sp_idxs: Indices of the spectra to include.

    Returns:
        tuple: ``(peaks_df, spectra_df)``. ``peaks_df`` has the columns
        ``sp``, ``mz`` (float64) and ``ints`` (float32), one row per kept
        peak. ``spectra_df`` is indexed by ``sp`` with the columns ``x``,
        ``y``, ``z``, ``mz_lo``, ``mz_hi`` and ``tic``; ``mz_lo``/``mz_hi``
        are NaN for a spectrum without any kept peak.
    """
    peaks_dfs = []
    spectra = []

    for i in sp_idxs:
        mzs, ints = p.getspectrum(i)
        x, y, z = p.coordinates[i]
        mask = ints > 0
        mzs = mzs[mask].astype(np.float64)
        ints = ints[mask].astype(np.float32)
        if mzs.size:
            peaks_dfs.append(pd.DataFrame({'sp': i, 'mz': mzs, 'ints': ints}))
            mz_lo, mz_hi = np.min(mzs), np.max(mzs)
        else:
            # np.min/np.max raise on empty arrays; a spectrum without
            # positive intensities has no m/z range to report.
            mz_lo = mz_hi = np.nan
        spectra.append((i, x, y, z, mz_lo, mz_hi, np.sum(ints)))

    if peaks_dfs:
        peaks_df = pd.concat(peaks_dfs)
    else:
        # pd.concat raises on an empty list; return an empty frame with the
        # same columns and dtypes instead.
        peaks_df = pd.DataFrame({
            'sp': np.array([], dtype=np.int64),
            'mz': np.array([], dtype=np.float64),
            'ints': np.array([], dtype=np.float32),
        })
    spectra_df = pd.DataFrame(
        spectra, columns=['sp', 'x', 'y', 'z', 'mz_lo', 'mz_hi',
                          'tic']).set_index('sp')

    return peaks_df, spectra_df
Exemplo n.º 21
0
    def search_pixel(self, x: int, y: int) -> dict:
        """Return the spectrum stored at pixel ``(x, y, 1)``.

        The imzML file at ``self.imzml_path`` is parsed on every call. When
        the pixel is not present, the first spectrum of the file is returned
        instead.

        Returns:
            dict with the keys ``mzs`` and ``ints`` (lists) and ``x`` and
            ``y`` (coordinates of the spectrum actually returned).
        """
        start = time.time()
        log(start, f"pixel parsing imzml at {self.imzml_path}")
        p = ImzMLParser(self.imzml_path)

        # Single scan of the coordinate list (was a membership test followed
        # by index()).
        try:
            n = p.coordinates.index((x, y, 1))
        except ValueError:
            n = 0  # pixel not found: fall back to the first spectrum
        coordinate_x = p.coordinates[n][0]
        coordinate_y = p.coordinates[n][1]

        mzs, ints = p.getspectrum(n)

        log(start, "done")
        return {
            'mzs': mzs.tolist(),
            'ints': ints.tolist(),
            'x': coordinate_x,
            'y': coordinate_y
        }
Exemplo n.º 22
0
def main(input_directory, output_directory, num_bins, input_kw=''):
    """Bin the peaks of every imzML file in a directory and save them as CSV.

    The working directory is changed to ``input_directory``. For each
    ``*.imzML`` file whose name contains ``input_kw``, peaks are detected in
    every spectrum with ``signal.find_peaks`` (height 50000) and their
    heights are summed into ``num_bins`` edges spread linearly over m/z
    450-1000. One row per pixel is written to
    ``<output_directory>/<name>_<X>x<Y>_aggregated.csv``.

    Args:
        input_directory: Directory searched for imzML files.
        output_directory: Directory receiving the CSV files.
        num_bins: Number of bin edges passed to ``linspace``.
        input_kw: Substring a file name must contain to be processed.
    """
    os.chdir(input_directory)
    files = [file for file in glob.glob("*.imzML") if input_kw in file]
    min_mzs = 450
    max_mzs = 1000
    for f in files:
        print(f)
        p = ImzMLParser(f)
        shape = (p.imzmldict['max count of pixels x'], p.imzmldict['max count of pixels y'])
        spectrums = [p.getspectrum(i) for i in range(len(p.coordinates))]
        all_mzs,  all_intensities = zip(*spectrums)
        peaks, peak_intensities = [], []
        for i,intensities in enumerate(all_intensities):
            print(f'Getting Intensities: {i}/{len(all_intensities)}')
            t = signal.find_peaks(intensities, 50*1000)
            peaks.append(all_mzs[i][idxs_to_bool(t[0], len(intensities))])
            peak_intensities.append(t[1]['peak_heights'])
        bins = linspace(min_mzs, max_mzs, num_bins)
        rows = []
        for pixel, (peak_l, intensity_l) in enumerate(zip(peaks, peak_intensities)):
            print(f'Binning: {pixel}/{len(p.coordinates)}')
            curr_pixel = pd.DataFrame({'mzs': peak_l, 'intensities': intensity_l})
            pixel_binned = {}
            # NOTE(review): between() is inclusive on both ends, so a peak
            # lying exactly on an edge is counted in two bins -- confirm
            # this is intended.
            for lower_bound, upper_bound in zip(bins[:-1], bins[1:]):
                curr_bin = curr_pixel[curr_pixel['mzs'].between(lower_bound, upper_bound)]
                pixel_binned[lower_bound] = curr_bin['intensities'].sum()
            rows.append(pixel_binned)
        # DataFrame.append was removed in pandas 2.0 (and copied the frame on
        # every call), so build the frame once. The all-zero index matches
        # what the appended one-row frames used to produce, which keeps the
        # CSV output unchanged.
        aggregated_df = pd.DataFrame(rows, columns=list(bins),
                                     index=[0] * len(rows))

        data_name = path.splitext(f)[0]
        outfile = path.join(output_directory,data_name)
        aggregated_df.to_csv(f'{outfile}_{shape[0]}x{shape[1]}_aggregated.csv')
Exemplo n.º 23
0
    def spectrum_iter(self):
        """
        Generator function that yields a position and associated spectrum for a selected datacube type.
        :yield: (xidx, yidx) a tuple of ints representing x and y position in the image
        :yield: yi,          a numpy 1D-array of floats containing spectral intensities at the given position
                                and for the selected datacube type
        """
        reader = ImzMLParser(self.basename)
        # enumerate() replaces the Python-2-only xrange() index loop.
        for idx, coord in enumerate(reader.coordinates):
            # Only x and y are used; any z component is ignored.
            xidx, yidx = coord[:2]
            # Coordinates may start at arbitrary locations, hence, we need to subtract the minimum to recenter at (0,0)
            xidx -= self.x_pos_min
            yidx -= self.y_pos_min
            mz, intens = reader.getspectrum(idx)
            # In processed mode, linearly interpolate the spectrum onto the
            # common axis self.mz (0 outside the spectrum's own m/z range)
            if self.imzml_type == self.available_imzml_types['processed']:
                f = interpolate.interp1d(mz,intens,fill_value=0,bounds_error=False)
                intens = f(self.mz)

            yield (xidx, yidx), np.asarray(intens)
Exemplo n.º 24
0
class IMZMLExtract:
    def __init__(self, fname, specStart=0):
        """Open ``fname`` and load or detect its pixel regions.

        ``self.mzValues`` holds the m/z values of spectrum 0 from position
        ``specStart`` onwards. ``find_regions`` is called last and fills
        ``self.dregions``.
        """
        self.fname = fname
        self.parser = ImzMLParser(fname)
        self.dregions = None
        self.specStart = specStart

        reference_mzs = self.parser.getspectrum(0)[0]
        if self.specStart != 0:
            reference_mzs = reference_mzs[self.specStart:]
            print("WARNING: SPECTRA STARTING AT POSITION", self.specStart)
        self.mzValues = reference_mzs

        self.find_regions()

    def get_region_ids(self):
        return [x for x in self.dregions]

    def get_spectrum(self, specid):
        spectra1 = self.parser.getspectrum(specid)[1]
        return spectra1

    def compare_spectra(self, specid1, specid2):

        spectra1 = self.parser.getspectrum(specid1)[1]
        spectra2 = self.parser.getspectrum(specid2)[1]

        ssum = 0.0
        len1 = 0.0
        len2 = 0.0

        assert (len(spectra1) == len(spectra2))

        for i in range(0, len(spectra1)):

            ssum += spectra1[i] * spectra2[i]
            len1 += spectra1[i] * spectra1[i]
            len2 += spectra2[i] * spectra2[i]

        len1 = math.sqrt(len1)
        len2 = math.sqrt(len2)

        return ssum / (len1 * len2)

    def get_mz_index(self, value):

        curIdxDist = 1000000
        curIdx = 0

        for idx, x in enumerate(self.mzValues):
            dist = abs(x - value)

            if dist < curIdxDist:
                curIdx = idx
                curIdxDist = dist

        return curIdx

    def get_region_spectra(self, regionid, back_spectrum=None):

        if not regionid in self.dregions:
            return None

        outspectra = {}

        for coord in self.dregions[regionid]:

            spectID = self.parser.coordinates.index(coord)

            if spectID == None or spectID < 0:
                print("Invalid coordinate", coord)
                continue

            cspec = self.parser.getspectrum(spectID)[1]
            cspec = cspec[self.specStart:]

            if len(cspec) == 0:
                print("0 spec")
                continue
            if back_spectrum:
                cspec = np.subtract(cspec, back_spectrum)
            cspec = cspec / np.max(cspec)
            cspec = cspec - np.min(cspec)
            outspectra[coord] = cspec

        return outspectra

    def get_region_range(self, regionid):

        allpixels = self.dregions[regionid]

        minx = min([x[0] for x in allpixels])
        maxx = max([x[0] for x in allpixels])

        miny = min([x[1] for x in allpixels])
        maxy = max([x[1] for x in allpixels])

        minz = min([x[2] for x in allpixels])
        maxz = max([x[2] for x in allpixels])

        spectraLength = 0
        for coord in self.dregions[regionid]:

            spectID = self.parser.coordinates.index(coord)

            if spectID == None or spectID < 0:
                print("Invalid coordinate", coord)
                continue

            splen = self.parser.mzLengths[spectID] - self.specStart

            spectraLength = max(spectraLength, splen)

        return (minx, maxx), (miny, maxy), (minz, maxz), spectraLength

    def get_region_shape(self, regionid):

        rr = self.get_region_range(regionid)
        xr, yr, zr, sc = rr

        imzeShape = [xr[1] - xr[0] + 1, yr[1] - yr[0] + 1]

        if zr[1] - zr[0] + 1 > 1:
            imzeShape.append(zr[1] - zr[0] + 1)

        imzeShape.append(sc)

        spectraShape = tuple(imzeShape)

        return spectraShape

    def get_region_array(self, regionid, back_spectrum=None):

        xr, yr, zr, sc = self.get_region_range(regionid)
        rs = self.get_region_shape(regionid)
        print(rs)

        sarray = np.zeros(rs, dtype=np.float32)

        coord2spec = self.get_region_spectra(regionid, back_spectrum)

        for coord in coord2spec:
            xpos = coord[0] - xr[0]
            ypos = coord[1] - yr[0]

            spectra = coord2spec[coord]

            if len(spectra) < sc:
                spectra = np.pad(spectra, ((0, 0), (0, sc - len(spectra))),
                                 mode='constant',
                                 constant_values=0)

            sarray[xpos, ypos, :] = spectra

        return sarray

    def find_regions(self):

        if os.path.isfile(self.fname + ".regions"):

            print("Opening regions file for", self.fname)

            with open(self.fname + ".regions", 'r') as fin:
                self.dregions = defaultdict(list)

                for line in fin:
                    line = line.strip().split("\t")

                    coords = [int(x) for x in line]

                    self.dregions[coords[3]].append(tuple(coords[0:3]))

            for regionid in self.dregions:

                allpixels = self.dregions[regionid]

                minx = min([x[0] for x in allpixels])
                maxx = max([x[0] for x in allpixels])

                miny = min([x[1] for x in allpixels])
                maxy = max([x[1] for x in allpixels])

        else:

            self.dregions = self.__detectRegions(self.parser.coordinates)

            with open(self.fname + ".regions", 'w') as outfn:

                for regionid in self.dregions:

                    for pixel in self.dregions[regionid]:

                        print("\t".join([str(x) for x in pixel]),
                              regionid,
                              sep="\t",
                              file=outfn)

    def __dist(self, x, y):

        assert (len(x) == len(y))

        dist = 0
        for pidx in range(0, len(x)):

            dist += abs(x[pidx] - y[pidx])

        return dist

    def __detectRegions(self, allpixels):
        """Group pixels into connected regions.

        Pixels are processed in the given order.  A pixel is attached to every
        existing region that contains a coordinate at Manhattan distance <= 1
        (distance computed by ``self.__dist``).  If it touches no region it
        starts a new one; if it touches several, those regions are merged into
        a single region that is moved to the end of the region list.

        :param allpixels: sequence of pixel coordinates
        :return: dict mapping region index (0..n-1) to a list of pixel tuples
        """
        allregions = []

        for idx, pixel in enumerate(allpixels):

            if not allregions:
                allregions.append([pixel])
                continue

            if idx % 1000 == 0:
                print("At pixel", idx, "of", len(allpixels), "with",
                      len(allregions), "regions")

            # Indices (ascending) of all regions the pixel is adjacent to.
            accRegions = [
                ridx for ridx, region in enumerate(allregions)
                if any(self.__dist(coord, pixel) <= 1 for coord in region)
            ]

            if not accRegions:
                allregions.append([pixel])

            elif len(accRegions) == 1:
                allregions[accRegions[0]].append(pixel)

            else:
                totalRegion = []
                for ridx in accRegions:
                    totalRegion += allregions[ridx]

                # The bridging pixel belongs to the merged region as well;
                # previously it was dropped and ended up in no region at all.
                totalRegion.append(pixel)

                # Delete from the back so the remaining indices stay valid.
                for ridx in reversed(accRegions):
                    del allregions[ridx]

                allregions.append(totalRegion)

        return {
            i: [tuple(x) for x in region]
            for i, region in enumerate(allregions)
        }

    def avg_background(self, background_id):
        """Return the mean intensity per m/z over a region's coordinate range.

        The x and y ranges come from ``self.get_region_range(background_id)``
        (element 0 is the x pair, element 1 the y pair) and are iterated with
        ``range``, i.e. the upper bound is excluded.  Only pixels with
        ``z == 1`` are looked up; coordinates without a spectrum are reported
        on stdout and skipped.

        :param background_id: region id passed to ``get_region_range``
        :return: list of mean intensities, one per distinct m/z value, in the
            order in which the m/z values were first encountered
        """
        region_range = self.get_region_range(background_id)
        xs = (region_range[0][0], region_range[0][1])
        ys = (region_range[1][0], region_range[1][1])

        # Map every coordinate to its first spectrum index once, instead of a
        # linear ``list.index`` scan for each pixel.
        coord_to_idx = {}
        for pos, coord in enumerate(self.parser.coordinates):
            coord_to_idx.setdefault(tuple(coord), pos)

        mz2intens = {}

        for x in range(xs[0], xs[1]):
            for y in range(ys[0], ys[1]):
                idx = coord_to_idx.get((x, y, 1))
                if idx is None:
                    print(f"({x}, {y}, 1) is not in list.")
                    continue

                # Errors while reading a spectrum are no longer swallowed and
                # misreported as a missing coordinate.
                tupl = self.parser.getspectrum(idx)
                sp = dict(zip(tupl[0], tupl[1]))
                for key, value in sp.items():
                    mz2intens.setdefault(key, []).append(value)

        return [sum(values) / len(values) for values in mz2intens.values()]
Exemplo n.º 25
0
    def __compute_file_info(cls, filename, resolution):
        ## TODO completely refactor this to make it smartly handle profile or centroid datasets
        ## TODO: centroid datasets should take in a user parameter "Resolution" and resample data at that resolution
        ## TODO: profile datasets should work as is
        ## TODO: checks for profile data vs. centroid data on the variation in length of ['m/z array']
        """
        Internal helper function used to compute the mz axis, data type for the intensities, format type

        :param filename: Path of the imzML file to inspect.
        :param resolution: Divisor used to size the re-interpolated m/z axis of processed files; the
            axis gets ``ceil(1e6 * ln(max_mz / min_mz) / resolution)`` logarithmically spaced points.

        :return: Tuple ``(coordinates, mz_axes, dtype, file_type, dataset_metadata,
            instrument_metadata, method_metadata)``: numpy array of coordinates, numpy array with the
            mz axis, dtype string of the first spectrum's intensities, the imzml file type (an entry
            of ``cls.available_imzml_types``) and three metadata dictionaries.
        """
        reader = ImzMLParser(filename)
        # Read the first spectrum
        mz_axes, intens = reader.getspectrum(0)
        # Read the coordinates
        coordinates = np.asarray(reader.coordinates)
        # Determine the data type for the intensity values
        dtype = np.asarray(intens).dtype.str

        # Compute the mz axis and file type
        file_type = cls.available_imzml_types['continuous']
        min_mz, max_mz = np.amin(mz_axes), np.amax(mz_axes)
        for ind in range(coordinates.shape[0]):
            mz, _ = reader.getspectrum(ind)
            # np.array_equal handles tuples, lists and numpy arrays alike; a plain ``==`` between
            # numpy arrays is element-wise and cannot be used as an ``if`` condition.
            if np.array_equal(mz, mz_axes):
                continue
            file_type = cls.available_imzml_types['processed']
            spectrum_min, spectrum_max = np.amin(mz), np.amax(mz)
            if min_mz > spectrum_min:
                min_mz = spectrum_min
            if max_mz < spectrum_max:
                max_mz = spectrum_max
        # Reinterpolate the mz-axis if we have a processed mode imzml file
        if file_type == cls.available_imzml_types['processed']:
            # np.logspace requires an integer number of samples
            num_points = int(np.ceil(1e6 * np.log(max_mz / min_mz) / resolution))
            mz_axes = np.logspace(np.log10(min_mz), np.log10(max_mz), num_points)
            log_helper.info(
                __name__, "Reinterpolated m/z axis for processed imzML file")

        # Construct the imzml metadata information
        dataset_metadata = metadata_dict()
        instrument_metadata = metadata_dict()
        method_metadata = metadata_dict()
        # ``items()`` exists on both Python 2 and Python 3 dictionaries
        for k, v in reader.imzmldict.items():
            dataset_metadata[k] = metadata_value(name=k,
                                                 value=v,
                                                 unit=None,
                                                 description=k,
                                                 ontology=None)

        # Delete the parser and read the metadata
        del reader

        # Parse the metadata for the file. We try to parse only the header and ignore the
        # <run > group in the XML file to avoid going through the whole file again
        # while extracting the majority of the relevant metadata
        # Header keys that are not stored in the dataset metadata
        key_targets = {
            'instrumentConfigurationList': instrument_metadata,
            'dataProcessingList': method_metadata,
            'softwareList': method_metadata,
            'sampleList': method_metadata,
        }
        try:
            with open(filename, 'r') as ins:
                metdata_header = ''
                for line in ins:
                    if '<run' in line:
                        break
                    metdata_header += line
                metdata_header += '</mzML>'
                metdata_header_dict = xmltodict.parse(metdata_header)['mzML']
                for k, v in metdata_header_dict.items():
                    store_value = metadata_value(
                        name=k,
                        value=v,
                        unit=None,
                        description=str(k) +
                        " extracted from imzML XML header.",
                        ontology=None)
                    key_targets.get(k, dataset_metadata)[k] = store_value
                dataset_metadata['imzml_xml_metadata_header'] = metadata_value(
                    name='imzml_xml_metadata_header',
                    value=metdata_header,
                    unit=None,
                    description='XML imzML header',
                    ontology=None)
        except Exception:
            # Best effort: the extra header metadata is optional. Unlike a bare ``except`` this
            # lets KeyboardInterrupt / SystemExit propagate.
            log_helper.warning(
                __name__, "Extraction of additional imzML metadata failed")

        return coordinates, np.asarray(
            mz_axes
        ), dtype, file_type, dataset_metadata, instrument_metadata, method_metadata
Exemplo n.º 26
0
#%%

# # Obtain data
# !mkdir msi_data
# !wget -O msi_data/test_POS.imzML https://www.ebi.ac.uk/metabolights/ws/studies/MTBLS487/download/9c756a3f-2c96-4449-8dd7-d64540df5c6c\?file\=test_POS.imzML
# !wget -O msi_data/test_POS.ibd https://www.ebi.ac.uk/metabolights/ws/studies/MTBLS487/download/9c756a3f-2c96-4449-8dd7-d64540df5c6c\?file\=test_POS.ibd

#%%
# Parse data
p = ImzMLParser('msi_data/test_POS.imzML')
# Image extent: the largest 1-based x and y coordinate in the file.
dimensions = (max(coor[0] for coor in p.coordinates),
              max(coor[1] for coor in p.coordinates))
# We know that z has only one value: 1
# picture[x - 1][y - 1] holds the spectrum of pixel (x, y); cells without a
# spectrum stay None.
picture = [[None] * dimensions[1] for _ in range(dimensions[0])]
for idx, (x, y, z) in enumerate(p.coordinates):
    mzs, intensities = p.getspectrum(idx)
    s = Spectrum(confs=list(zip(mzs, intensities)), label=str(x - 1) + ", " + str(y - 1))
    # remove peptide artifacts
    s.confs = [conf for conf in s.confs if conf[0] < 1000]
    picture[x - 1][y - 1] = s

#%%
# Apply peak-picking procedure
for row in picture:
    for spectrum in row:
        if spectrum is None:
            # No spectrum was acquired at this position (sparse image).
            continue
        spectrum.confs = spectrum.find_peaks()
        spectrum.confs = spectrum.centroid(0.5)

#%%
from MasSpOT import perform_clusterization
label_picture = perform_clusterization(picture, dimensions)
Exemplo n.º 27
0
def get_consensus(cluster_id,
                  matrix,
                  dist_dot_product,
                  ids,
                  imzMLfile,
                  xs,
                  ys,
                  plots=False):
    """Build a consensus spectrum for one cluster of an imzML file.

    The cluster members are obtained from ``get_cluster_elements``.  Walking
    through the members in order, neighbouring spectra are averaged into a
    running segment; a new segment is started whenever the distance to the
    next member is not smaller than the distance to the previous one.  The
    consensus is the running average over all segments.

    :param cluster_id: cluster to process (passed to ``get_cluster_elements``)
    :param matrix: passed to ``get_cluster_elements``
    :param dist_dot_product: pairwise distances, indexed as ``[row, column]``
        with positions taken from ``ids``
    :param ids: list used to translate cluster members into positions of
        ``dist_dot_product`` (via ``ids.index``)
    :param imzMLfile: path of the imzML file
    :param xs: passed to ``get_cluster_elements``
    :param ys: passed to ``get_cluster_elements``
    :param plots: if True, plot all member spectra and the consensus
    :return: consensus spectrum as produced by ``tupel2map`` /
        ``average_spectra``
    """
    parser = ImzMLParser(imzMLfile)

    cluster_ids = get_cluster_elements(cluster_id, matrix, parser, xs, ys)
    cluster_matrix_ids = [ids.index(elem) for elem in cluster_ids]
    if len(cluster_matrix_ids) == 1:
        # NOTE(review): every other lookup below reads spectra by
        # ``cluster_ids``; this one uses the matrix position. Confirm that
        # both are identical for the callers of this function.
        return tupel2map(parser.getspectrum(cluster_matrix_ids[0]))

    n_members = len(cluster_matrix_ids)
    distance = np.zeros((n_members, n_members))

    for i in range(n_members):
        for j in range(n_members):
            distance[i,
                     j] = distance[j,
                                   i] = dist_dot_product[cluster_matrix_ids[i],
                                                         cluster_matrix_ids[j]]
    print(distance.shape)
    np.fill_diagonal(distance, 0)
    # NOTE(review): the clustering result is not used further down. The calls
    # are kept because they also reject unusable distance matrices.
    Z = linkage(squareform(distance), method='average', metric='cosine')
    fcluster(Z, t=0, criterion='distance')

    new_spectum = {}
    spectra_list = list()
    for i in range(n_members - 1):
        if i == 0:
            new_spectum = average_spectra(
                tupel2map(parser.getspectrum(cluster_ids[i])),
                tupel2map(parser.getspectrum(cluster_ids[i + 1])))
        else:
            left = distance[i - 1, i]
            right = distance[i, i + 1]
            if left > right:
                new_spectum = average_spectra(
                    new_spectum, tupel2map(parser.getspectrum(cluster_ids[i])))
            else:
                # Close the current segment and start a new one.
                spectra_list.append(new_spectum)
                new_spectum = average_spectra(
                    tupel2map(parser.getspectrum(cluster_ids[i])),
                    tupel2map(parser.getspectrum(cluster_ids[i + 1])))

    # Combine ALL segments. Previously the closed segments in ``spectra_list``
    # were discarded whenever there was more than one segment, so only the
    # last segment contributed to the consensus.
    spectra_list.append(new_spectum)
    consensus = spectra_list[0]
    for spect in spectra_list:
        consensus = average_spectra(consensus, spect)

    if plots:
        plt.figure()
        for i in cluster_ids:
            spectrum = tupel2map(parser.getspectrum(i))
            x, y = zip(*spectrum.items())  # unpack a list of pairs into two tuples
            # np.asarray: a plain tuple cannot be divided by a Python float
            plt.plot(x, np.asarray(y) / max(y), label="Spectral ID {}".format(i))

        x, y = zip(*consensus.items())  # unpack a list of pairs into two tuples
        plt.plot(x, np.asarray(y) / max(y), label="Consensus", c='black')

        plt.xlabel("m/z", fontsize=20)
        plt.ylabel("Intensity (normalized by maximum internsity)", fontsize=20)
        plt.legend(fontsize=20)
        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        plt.show()

    return consensus
Exemplo n.º 28
0
    def write_msi_to_hdf(self, h5f_fname_w_path, imzml_fname_w_path, norm=''):
        """
        Converts imzML data to HDF5 format. Iterates through imzML and, one
        spectrum at a time, reads and writes spectral data to h5 as new raw
        dataset. Spectra are stored to HDF5 dataset in the order in which
        they are read from imzML file.

        The m/z axis of the first spectrum is stored as ``self.mzsaxis`` and
        used as the axis for all spectra.  The HDF5 group ``MALDI_001``
        receives the datasets ``Intensities``, ``m_over_z``, ``Coordinates``
        and ``Average spectrum`` plus the group ``Normalization Factors``.

        :param h5f_fname_w_path: path of the HDF5 file to create (overwritten)
        :param imzml_fname_w_path: path of the imzML file to read
        :param norm: normalization passed to
            ``self._calculate_new_normalization_``; ``''`` or ``'none'``
            (any case) skips normalization
        """

        imzml = ImzMLParser(imzml_fname_w_path)
        #Infer dataset dimensions from first spectrum in imzML
        samp_mz, _ = imzml.getspectrum(0)
        length = len(samp_mz)
        self.mzsaxis = np.asarray(samp_mz)
        height = len(imzml.mzOffsets)

        #Create and open hdf5 file; the context manager closes it even if
        #the import fails half way through
        with h5py.File(h5f_fname_w_path, 'w') as hf:
            print('\n%i spectra x %i mz data points'%(height, length))
            print()
            print('Importing...')
            print('')
            grp = hf.create_group('MALDI_001')

            # A chunk may not be larger than the dataset itself, so clip the
            # chunk width for spectra with fewer than 1024 points.
            dset1 = grp.create_dataset('Intensities', shape=(height, length),
                                       chunks=(1, min(1024, length)),
                                       dtype='float64')
            grp.create_dataset('m_over_z', data=self.mzsaxis)
            dset3 = grp.create_dataset('Coordinates', shape=(height,2))
            grp.create_group('Normalization Factors')

            # Iterates through spectra contained in .ibd binary using getspectrum()
            # For each spectrum, writes the intensity list and the physical
            # coordinates as line in the datasets of group "MALDI_001"
            summed_ints = np.zeros(length)
            for i in range(height):
                _, intensity = imzml.getspectrum(i)
                try:
                    coordinates = imzml.get_physical_coordinates(i)
                except KeyError:
                    # Fall back to the x pixel size when the y pixel size is
                    # missing from the imzML metadata
                    imzml.imzmldict['pixel size y'] = imzml.imzmldict['pixel size x']
                    coordinates = imzml.get_physical_coordinates(i)
                dset1[i] = intensity
                dset3[i] = np.asarray([coordinates[0], coordinates[1]])
                summed_ints = summed_ints + intensity
                if self.log and i > 0:
                    if i % 1000 == 0:
                        print('%i / %i'%((i, height)))
            average_spectrum = summed_ints / height
            grp.create_dataset('Average spectrum', data=(average_spectrum), dtype='float64')

            #Set Numpy error handling to 'print' for all categories
            #(process-wide setting, kept from the original implementation)
            np.seterr(all = 'print')
            print('Finished importing!')
            if not (norm=='' or norm.upper()=='NONE'):
                self._calculate_new_normalization_(norm, hf)
            # Flush buffer; the file is closed when the with-block exits
            hf.flush()
        return
Exemplo n.º 29
0
class DefectFilter:
    """Mass-defect based filtering of the spectra stored in an imzML file.

    All spectra are read into memory on construction (``mzlist`` and
    ``intensity_list``, one entry per pixel in file order).
    """

    def __init__(self, filename):
        """ Initialize Filtering Framework from an imzml file """
        self.spectrum = ImzMLParser(filename)
        self.mzlist = []
        self.intensity_list = []
        # NOTE(review): the file name is not stored here; confirm whether an
        # empty list is really intended.
        self.filename = []
        self.filter_spec_mass = np.zeros(np.shape(self.mzlist))
        self.filter_spec_intens = np.zeros(np.shape(self.intensity_list))
        # Accumulators used by kendrickMassList() / KMDplot()
        self.KM2 = []
        self.KMD2 = []

        for idx, (x, y, z) in enumerate(self.spectrum.coordinates):
            # self.mzs / self.intensities keep the most recently read spectrum;
            # glycanFilter() builds its reference line from self.mzs.
            self.mzs, self.intensities = self.spectrum.getspectrum(idx)
            self.mzlist.append(self.mzs)
            self.intensity_list.append(self.intensities)

    def MSIFilter(self, coi, alpha):
        """Filter imzML file for complex of interest.

        Only ``coi == "N-Glycan"`` is implemented; any other value does
        nothing.  The filtered spectra are written to a new imzML file named
        ``Filtered_mz_<random number>``.

        :param coi: complex of interest
        :param alpha: probability threshold forwarded to ``glycanProb``
        """
        if coi == "N-Glycan":
            self.glycanFilter()
            truefiltertime = time.time()
            self.filterIntens(self.intensity_list, self.mzlist)
            truefilterend = time.time()
            print("Removal of 0 values: " +
                  str(truefilterend - truefiltertime))
            self.glycan_intens = []
            for mzs, intens in zip(self.filtered_mzs, self.filtered_intens):
                kendricktime = time.time()
                self.kendrickMass(mzs)
                kendrickend = time.time()
                print("KMD Algorithm Time: " + str(kendrickend - kendricktime))
                filtertime = time.time()
                probFilter = self.glycanProb(self.KM, self.KMD, alpha, intens)
                filterend = time.time()
                print("Prob Time: " + str(filterend - filtertime))
                self.glycan_intens.append(probFilter)

            outname = "Filtered_mz_" + str(np.random.randint(100000))
            with ImzMLWriter(outname) as w:
                # filterIntens() drops empty spectra, so the position in the
                # filtered lists is not the pixel index; use the recorded
                # original index to pick the matching coordinate.
                for mzs, intens, spec_idx in zip(self.filtered_mzs,
                                                 self.glycan_intens,
                                                 self.filtered_spectrum_idx):
                    w.addSpectrum(mzs, intens,
                                  self.spectrum.coordinates[spec_idx])
            print("File Written to : " + outname)

    def glycanFilter(self, max_defect=3):
        """create a line for the glycan filter based on ASMS 2019 poster

        Sets ``glycanMD`` (expected mass defect per m/z of ``self.mzs``),
        ``glycanDict`` (m/z -> expected mass defect) and ``glycanSigma``.
        ``max_defect`` is currently unused.
        """
        self.glycanMD = self.mzs * 3.5 * 10**(-4) + 0.0039
        self.glycanSigma = 0.0173
        self.glycanDict = dict(zip(self.mzs, self.glycanMD))

    def glycanProb(self, KM, KMD, alpha, intensities, dist='Norm'):
        """Provide an intensity spectrum filtered for KMD values within alpha of known values.

        Works on a single spectrum.  For every mass ``KM[i]`` the smallest
        value of ``norm.cdf(|expected - defect|, scale=glycanSigma) - 0.5``
        over the defects in ``KMD[i]`` is computed; if it exceeds ``alpha``
        (or no defect was evaluated) the intensity at position ``i`` is set
        to 0.

        :param KM: masses, keys of ``self.glycanDict``
        :param KMD: per mass, the list of candidate mass defects
        :param alpha: probability threshold
        :param intensities: intensities of the spectrum (not modified)
        :param dist: only ``'Norm'`` is implemented; any other value
            evaluates no defect
        :return: copy of ``intensities`` with rejected positions zeroed
        """
        # TODO: replace with a t-test or z-test from software
        glycanFilterInt = intensities.copy()
        for i, mass in enumerate(KM):
            xbar = self.glycanDict[mass]
            bestProb = 1
            for j in KMD[i]:
                if dist != 'Norm':
                    break
                prob = st.norm.cdf(
                    abs(xbar - j), loc=0, scale=self.glycanSigma) - 0.5
                if prob < bestProb:
                    bestProb = prob
            if bestProb > alpha:
                glycanFilterInt[i] = 0
        return glycanFilterInt

    def kendrickMass(self, mzs, max_defect=3):
        """Collect candidate mass defects for every m/z of a single spectrum.

        For each m/z, ``fractional part + i`` is appended to the entry of the
        already seen m/z value ``mz - i`` (exact match) for
        ``i = 0..max_defect``.  Results are stored in ``KMdict``, ``KM``
        (keys) and ``KMD`` (values).
        """
        # Start with KMs between 0 and 1:
        self.KMdict = {}

        for mz in mzs:
            self.KMdict[mz] = []
            defect, _ = np.modf(mz)

            for i in range(max_defect + 1):
                if mz - i in self.KMdict:
                    self.KMdict[mz - i].append(defect + i)

        self.KM = list(self.KMdict.keys())
        self.KMD = list(self.KMdict.values())

    def kendrickMassList(self, mzs):
        """Append Kendrick mass and mass defect of a single spectrum to ``KM2`` / ``KMD2``."""
        # Accept plain lists as produced by filterIntens()
        mzs = np.asarray(mzs, dtype=float)
        KM = mzs * 14 / 14.01565
        self.KM2.append(KM)
        KMD = np.floor(KM) - KM
        self.KMD2.append(KMD)

    def KMDplot(self):
        """Scatter-plot the values in ``KMD2`` against ``filtered_mass`` (set by ``kendrickFilter``)."""
        axes = plt.axes()
        axes.set_ylim([-1, 0])
        for mass, kmd in zip(self.filtered_mass, self.KMD2):
            plt.scatter(mass, kmd)
        plt.show()

    def filterIntens(self, intens_list, mzlist, thresh=0):
        """Drop all points with intensity <= ``thresh`` from every spectrum.

        Spectra without any point above the threshold are skipped entirely.
        Results are stored in ``filtered_intens`` and ``filtered_mzs`` (one
        list per kept spectrum), ``filter_idx`` (``(spectrum, point)`` index
        pairs of the kept points) and ``filtered_spectrum_idx`` (original
        index of each kept spectrum).

        :param intens_list: intensities, one sequence per spectrum
        :param mzlist: m/z values, parallel to ``intens_list``
        :param thresh: intensity threshold
        """
        print("iteration")
        self.filtered_intens = []
        self.filtered_mzs = []
        self.filter_idx = []
        self.filtered_spectrum_idx = []
        for i, spec_intens in enumerate(intens_list):
            if np.all(np.asarray(spec_intens) <= thresh):
                continue
            kept = [k for k, value in enumerate(spec_intens) if value > thresh]
            self.filtered_intens.append([spec_intens[k] for k in kept])
            self.filtered_mzs.append([mzlist[i][k] for k in kept])
            self.filter_idx.append([(i, k) for k in kept])
            self.filtered_spectrum_idx.append(i)

    def kendrickFilter(self, thresh, intens_list, mzlist):
        """Threshold all spectra and compute their Kendrick masses.

        Takes full spectrum lists not single spectrum.  Fills
        ``filtered_mass`` (the thresholded m/z lists), ``KM2`` and ``KMD2``.
        """
        # filterIntens() expects (intens_list, mzlist, thresh) and already
        # iterates over all spectra.
        self.filterIntens(intens_list, mzlist, thresh)
        self.filtered_mass = self.filtered_mzs
        # Start from empty accumulators so KM2/KMD2 line up with filtered_mass
        self.KM2 = []
        self.KMD2 = []
        for mzs in self.filtered_mass:
            self.kendrickMassList(mzs)
Exemplo n.º 30
0
      imze.get_region_range(region)[0][1] + 1)
ys = (imze.get_region_range(region)[1][0],
      imze.get_region_range(region)[1][1] + 1)


def tupel2map(spec):
    """Turn a ``(keys, values, ...)`` spectrum tuple into a ``{key: value}`` dict.

    Pairs are formed positionally from the first two elements of ``spec``;
    for repeated keys the last value wins.
    """
    keys, values = spec[0], spec[1]
    return {key: value for key, value in zip(keys, values)}


mz2intens = {}
print('Calculating pixel map...')
# Map every coordinate to its first spectrum index once instead of scanning
# the coordinate list for each pixel (iterating in reverse makes the first
# occurrence win, like list.index).
coord_to_idx = {
    tuple(coord): pos
    for pos, coord in reversed(list(enumerate(parser.coordinates)))
}
for x in range(xs[0], xs[1]):
    for y in range(ys[0], ys[1]):
        found = coord_to_idx.get((x, y, 1))
        if found is None:
            print(f"({x}, {y}, 1) is not in list.")
            continue
        idx = found
        # Errors while reading a spectrum are no longer swallowed and
        # misreported as a missing coordinate.
        sp = tupel2map(parser.getspectrum(idx))
        for k in sp:
            mz2intens.setdefault(k, []).append(sp[k])

# Mean intensity per m/z value over all pixels that contain it
mz2avg = {}
for key in mz2intens:
    mz2avg[key] = sum(mz2intens[key]) / len(mz2intens[key])

if save:
    filename = imzMLfile + "." + str(region) + "_avg" + ".pickle"
Exemplo n.º 31
0
def imzml_to_hdf5(imzml_file_path, out_path, mir_path):
    """Convert an imzML file into an HDF5 store holding one intensity table.

    The table has one row per spectrum and one column per m/z value (the
    sorted union of all m/z values in the file); it is indexed by
    ``(grid_x, grid_y, dataset)``.  Missing and negative intensities are set
    to zero.  The result is written to ``<out_path>/<dataset name>.h5`` under
    the key ``msi_frame_<dataset name>``, where the dataset name is the file
    name without extension.

    :param imzml_file_path: path of the imzML file
    :param out_path: directory receiving the HDF5 file
    :param mir_path: if truthy, passed to ``select_peaks_from_msi_frame`` to
        reduce the table to picked peaks
    """
    dataset_name = os.path.splitext(os.path.basename(imzml_file_path))[0]

    print()
    print('Loading', imzml_file_path)
    parser = ImzMLParser(imzml_file_path, parse_lib='ElementTree')
    print()
    print('Loading done!')

    # check if all spectra have the same mz axis
    spectrum_count = len(parser.mzLengths)
    first_axis_length = len(np.array(parser.getspectrum(0)[0]))
    print()
    print('m/z consistency check ...')

    # element 0 of a spectrum holds the mz values, element 1 the intensities
    all_mz_values = [parser.getspectrum(i)[0] for i in range(spectrum_count)]
    mz_index = np.unique(np.concatenate(all_mz_values))

    if first_axis_length != len(mz_index):
        print(
            'WARNING: Not all spectra have the same mz values. Missing values are filled with zeros!'
        )

    print()
    print('m/z consistency check done!')

    # DEV: use small range to test bigger datasets on little memory
    mz_selection = slice(None)  # range(100)
    # All intensities go into a single data frame:
    #   1 row = 1 spectrum
    #   1 column = all intensities for 1 mz, i.e. a single intensity image
    print()
    print('DataFrame creation ...')
    rows = intensities_generator(parser, mz_index, mz_selection)
    msi_frame = pd.DataFrame(rows, columns=mz_index[mz_selection])
    print('DataFrame creation done')
    print()
    print("DataFrame size equals: %i pixels, %i mz-values" % msi_frame.shape)
    print()

    if mir_path:
        print()
        print('Peak picking ...')
        msi_frame = select_peaks_from_msi_frame(msi_frame, mir_path)
        print()
        print('Peak picking done!')

    msi_frame = msi_frame.fillna(0)

    # Index the rows by pixel position and dataset name
    grid_xy = np.asarray(parser.coordinates)[:, [0, 1]]
    grid_index = pd.MultiIndex.from_arrays(grid_xy.T,
                                           names=("grid_x", "grid_y"))
    msi_frame = msi_frame.set_index(grid_index)

    msi_frame["dataset"] = [dataset_name] * len(msi_frame)
    msi_frame = msi_frame.set_index("dataset", append=True)

    # For some data sets a small fraction of intensities (~0.1%) have been
    # negative, this might be a numerical issue in the imzml export by bruker.
    # DEV ad-hoc fix (couldn't figure out the cause or a more reasonable fix so far)
    msi_frame[msi_frame < 0] = 0

    print()
    print('Write DataFrame ...')
    store_path = os.path.join(out_path, '{}.h5'.format(dataset_name))
    frame_key = 'msi_frame_{}'.format(dataset_name)
    with pd.HDFStore(store_path, complib='blosc', complevel=9) as store:
        store[frame_key] = msi_frame
    print()
    print('done. Script completed!')
Exemplo n.º 32
0
class inMemoryIMS():
    def __init__(self,
                 filename,
                 min_mz=0.,
                 max_mz=np.inf,
                 min_int=0.,
                 index_range=None,
                 cache_spectra=True,
                 do_summary=True,
                 norm='none',
                 norm_args=None,
                 spectrum_type='centroids'):
        """Open ``filename`` and load it via ``load_file``.

        All arguments are forwarded unchanged to ``load_file``.
        ``index_range=None`` and ``norm_args=None`` stand for an empty list
        and an empty dict; a fresh object is created per instance so that
        instances never share a mutable default.

        Raises:
            OSError: if ``filename`` does not exist or is not accessible.
        """
        # Fail early with a clear OSError when the file is missing.
        os.path.getsize(filename)
        self.load_file(filename,
                       min_mz,
                       max_mz,
                       min_int,
                       index_range=[] if index_range is None else index_range,
                       cache_spectra=cache_spectra,
                       do_summary=do_summary,
                       norm=norm,
                       norm_args={} if norm_args is None else norm_args,
                       spectrum_type=spectrum_type)

    def load_file(self,
                  filename,
                  min_mz=0,
                  max_mz=np.inf,
                  min_int=0,
                  index_range=None,
                  cache_spectra=True,
                  do_summary=True,
                  norm=None,
                  norm_args=None,
                  spectrum_type='centroids'):
        """Parse an ``.hdf5`` or ``.imzml`` file and optionally cache its spectra.

        :param filename: path of the data file; the extension selects the parser
        :param min_mz: only peaks with m/z strictly above this value are kept
        :param max_mz: only peaks with m/z strictly below this value are kept
        :param min_int: only peaks with intensity strictly above this value are kept
        :param index_range: spectrum indices to load from an hdf5 file;
            ``None`` or empty loads all (ignored for imzML files)
        :param cache_spectra: keep all peaks in memory, sorted by m/z
            (``mz_list``, ``count_list``, ``idx_list``)
        :param do_summary: compute per-spectrum total (``tic``) and maximum
            (``mic``) intensity
        :param norm: normalisation method; strings are lower-cased, ``None``
            or ``[]`` means that ``get_spectrum`` applies no normalisation
        :param norm_args: arguments for the normalisation method
        :param spectrum_type: ``'centroids'`` or any other value for profile
            spectra
        :raises TypeError: for an unsupported file extension or when a
            spectrum has differing numbers of m/z values and intensities
        """
        # parse file to get required parameters
        # can use thin hdf5 wrapper for getting data from file
        self.file_dir, self.filename = os.path.split(filename)
        self.filename, self.file_type = os.path.splitext(self.filename)
        self.file_type = self.file_type.lower()
        if norm is None:
            norm = []
        # The former default ``norm=[]`` crashed on ``.lower()``; only
        # string-like values are lower-cased now.
        self.norm = norm.lower() if hasattr(norm, 'lower') else norm
        self.norm_args = {} if norm_args is None else norm_args
        if self.file_type == '.hdf5':
            import h5py
            self.hdf = h5py.File(filename, 'r')  # Readonly, file must exist
            if index_range is None or len(index_range) == 0:
                # A real list (not a lazy map object) so that len() and
                # repeated iteration work on Python 3 as well.
                self.index_list = [
                    int(key) for key in self.hdf['/spectral_data'].keys()
                ]
            else:
                self.index_list = index_range
        elif self.file_type == '.imzml':
            from pyimzml.ImzMLParser import ImzMLParser
            self.imzml = ImzMLParser(filename)
            self.index_list = range(0, len(self.imzml.coordinates))
        else:
            raise TypeError('File type not recogised: {}'.format(
                self.file_type))
        self.max_index = max(self.index_list)
        self.coords = self.get_coords()
        step_size = self.get_step_size()
        cube = ion_datacube(step_size=step_size)
        cube.add_coords(self.coords)
        self.cube_pixel_indices = cube.pixel_indices
        self.cube_n_row, self.cube_n_col = cube.nRows, cube.nColumns
        self.histogram_mz_axis = {}
        self.mz_min = 9999999999999.
        self.mz_max = 0.
        self.spectrum_type = spectrum_type  #todo this should be read for imzml files, not coded as an input
        if cache_spectra or do_summary:
            # load data into memory
            self.mz_list = []
            self.count_list = []
            self.idx_list = []
            if do_summary:
                self.mic = np.zeros((len(self.index_list), 1))
                self.tic = np.zeros((len(self.index_list), 1))
            for ii in self.index_list:
                # load spectrum, keep values gt0 (shouldn't be here anyway)
                this_spectrum = self.get_spectrum(ii)
                mzs, counts = this_spectrum.get_spectrum(source=spectrum_type)
                if len(mzs) != len(counts):
                    raise TypeError(
                        'length of mzs ({}) not equal to counts ({})'.format(
                            len(mzs), len(counts)))
                # Enforce data limits
                valid = np.where((mzs > min_mz) & (mzs < max_mz)
                                 & (counts > min_int))
                counts = counts[valid]
                mzs = mzs[valid]
                # record min/max (first/last element, i.e. this relies on
                # each spectrum being sorted by m/z)
                if len(mzs) != 0:
                    if mzs[0] < self.mz_min:
                        self.mz_min = mzs[0]
                    if mzs[-1] > self.mz_max:
                        self.mz_max = mzs[-1]
                    #record summary values
                    if do_summary:
                        self.tic[ii] = sum(counts)
                        self.mic[ii] = max(counts)
                # append ever-growing lists (should probably be preallocated or piped to disk and re-loaded)
                if cache_spectra:
                    self.mz_list.append(mzs)
                    self.count_list.append(counts)
                    self.idx_list.append(np.ones(len(mzs), dtype=int) * ii)

            # print() with a single argument behaves the same on Python 2 and 3
            print('loaded spectra')
            if cache_spectra:
                self.mz_list = np.concatenate(self.mz_list)
                self.count_list = np.concatenate(self.count_list)
                self.idx_list = np.concatenate(self.idx_list)
                # sort by mz for fast image formation
                mz_order = np.argsort(self.mz_list)
                self.mz_list = self.mz_list[mz_order]
                self.count_list = self.count_list[mz_order]
                self.idx_list = self.idx_list[mz_order]
                # split binary searches into two stages for better locality
                self.window_size = 1024
                self.mz_sublist = self.mz_list[::self.window_size].copy()
        print('file loaded')

    def get_step_size(self):
        """Return ``[1, 1, 1]`` for imzML files and an empty list otherwise."""
        return [1, 1, 1] if self.file_type == '.imzml' else []

    def get_coords(self):
        """Return the pixel coordinates from the parser matching ``self.file_type``.

        For imzML files the first two columns of the coordinate array are
        swapped.

        :raises TypeError: if ``self.file_type`` is neither ``'.imzml'`` nor
            ``'.hdf5'`` (previously this ended in an UnboundLocalError)
        """
        # wrapper for redirecting requests to correct parser
        if self.file_type == '.imzml':
            coords = self.get_coords_imzml()
            coords[:, [0, 1]] = coords[:, [1, 0]]
        elif self.file_type == '.hdf5':
            coords = self.get_coords_hdf5()
        else:
            raise TypeError('File type not recogised: {}'.format(
                self.file_type))
        return coords

    def get_coords_imzml(self):
        """Return the imzML pixel coordinates as an array with three columns.

        Two-dimensional coordinates get a zero z column appended.  The values
        are the raw pixel indices (see the TODO message printed below).
        """
        print('TODO: convert indices into real world coordinates')
        raw_coordinates = self.imzml.coordinates
        coords = np.asarray(raw_coordinates)
        if len(raw_coordinates[0]) != 2:
            return coords
        # 2D - append zero z-coord
        zero_column = np.zeros((len(coords), 1))
        return np.concatenate((coords, zero_column), axis=1)

    def get_coords_hdf5(self):
        """Read the coordinates of every spectrum in ``self.index_list`` from the hdf5 file.

        Row ``k`` of the returned ``(len(index_list), 3)`` array holds the
        entry ``/spectral_data/<k>/coordinates/``.
        """
        n_spectra = len(self.index_list)
        coords = np.zeros((n_spectra, 3))
        for spectrum_index in self.index_list:
            key = '/spectral_data/' + str(spectrum_index) + '/coordinates/'
            coords[spectrum_index, :] = self.hdf[key]
        return coords

    def get_spectrum(self, index):
        """Load spectrum ``index`` with the parser matching ``self.file_type``.

        Unless ``self.norm`` equals ``[]``, the spectrum is normalised via its
        ``normalise_spectrum`` method using ``self.norm`` and
        ``self.norm_args``.
        """
        file_type = self.file_type
        # redirect the request to the correct parser
        if file_type == '.imzml':
            this_spectrum = self.get_spectrum_imzml(index)
        elif file_type == '.hdf5':
            this_spectrum = self.get_spectrum_hdf5(index)
        if self.norm != []:
            this_spectrum.normalise_spectrum(method=self.norm,
                                             method_args=self.norm_args)
        return this_spectrum

    def get_spectrum_imzml(self, index):
        """Read spectrum ``index`` from the imzML parser into a ``mass_spectrum``.

        The data is stored as centroids when ``self.spectrum_type`` is
        ``'centroids'`` and as a regular spectrum otherwise.
        """
        mzs, intensities = self.imzml.getspectrum(index)
        ## temp hack -> assume centroided
        this_spectrum = mass_spectrum()
        store = (this_spectrum.add_centroids
                 if self.spectrum_type == 'centroids' else
                 this_spectrum.add_spectrum)
        store(mzs, intensities)
        return this_spectrum

    def get_spectrum_hdf5(self, index):
        """Read spectrum ``index`` from the hdf5 file into a ``mass_spectrum``.

        Both the profile data (``mzs`` / ``intensities``) and the centroid
        data (``centroid_mzs`` / ``centroid_intensities``) are loaded when
        present; a missing entry (KeyError) of either kind is tolerated.

        :raises ValueError: if neither profile nor centroid data is found
        """
        import h5py
        this_spectrum = mass_spectrum()
        base = '/spectral_data/%d' % (index)
        sources = (
            (this_spectrum.add_spectrum, '/mzs/', '/intensities/'),
            (this_spectrum.add_centroids, '/centroid_mzs/',
             '/centroid_intensities/'),
        )
        found = []
        for store, mz_key, intensity_key in sources:
            try:
                store(self.hdf[base + mz_key], self.hdf[base + intensity_key])
            except KeyError:
                found.append(False)
            else:
                found.append(True)
        if not any(found):
            raise ValueError(
                'No spectral data found in index {}'.format(index))
        return this_spectrum

    def empty_datacube(self):
        """Return a new ``ion_datacube`` pre-filled with this dataset's geometry.

        The coordinates, pixel indices and row/column counts stored on this
        object are copied onto the new cube.
        """
        cube = ion_datacube()
        # reuse the precomputed pixel indices
        cube.coords = self.coords
        cube.pixel_indices = self.cube_pixel_indices
        cube.nRows = self.cube_n_row
        cube.nColumns = self.cube_n_col
        return cube

    def get_ion_image(self, mzs, tols, tol_type='ppm'):
        """Sum cached peak intensities per spectrum for each m/z window.

        :param mzs: window centre(s); a scalar or a sequence of m/z values.
        :param tols: window half-width(s); a scalar, a one-element sequence
            (applied to every m/z) or one value per m/z.
        :param tol_type: ``'ppm'`` scales ``tols`` relative to each centre;
            any other value uses ``tols`` directly as m/z units.
        :return: the datacube from ``self.empty_datacube()`` after one
            ``add_xic`` call per m/z, in input order.

        Relies on ``self.mz_list`` being sorted ascending with
        ``self.mz_sublist == self.mz_list[::self.window_size]``
        (NOTE(review): that setup is not visible in this class fragment).
        """
        # Promote scalars to one-element arrays so the rest is uniform.
        mzs = np.asarray(mzs)
        if mzs.ndim == 0:
            mzs = mzs.reshape(1)
        tols = np.asarray(tols)
        if tols.ndim == 0:
            tols = tols.reshape(1)
        if len(tols) == 1:
            # a single tolerance applies to every requested m/z
            tols = tols * np.ones(np.shape(mzs))
        if tol_type == 'ppm':
            tols = tols * mzs / 1e6  # to m/z
        data_out = self.empty_datacube()
        # Fast search for insertion point of mz in self.mz_list
        # First stage is looking for windows using the sublist.
        # The side is spelled out: NumPy deprecated the 'l'/'r' shorthands.
        idx_left = np.searchsorted(self.mz_sublist, mzs - tols, side='left')
        idx_right = np.searchsorted(self.mz_sublist, mzs + tols, side='right')
        for mz, tol, il, ir in zip(mzs, tols, idx_left, idx_right):
            # Widen by one window on the left: the true start may sit in the
            # window that begins just below the lower bound.
            l = max(il - 1, 0) * self.window_size
            r = ir * self.window_size
            # Second stage is binary search within the windows
            il = l + np.searchsorted(self.mz_list[l:r], mz - tol, side='left')
            ir = l + np.searchsorted(self.mz_list[l:r], mz + tol, side='right')
            # bin the matching peaks by spectrum index
            ion_vect = np.bincount(self.idx_list[il:ir],
                                   weights=self.count_list[il:ir],
                                   minlength=self.max_index + 1)
            data_out.add_xic(ion_vect, [mz], [tol])
        return data_out
        # Form histogram axis

    def generate_histogram_axis(self, ppm=1.):
        """Build and cache a geometric m/z axis with ``ppm`` relative spacing.

        Starts at ``self.mz_min`` and multiplies by ``1 + ppm * 1e-6`` until
        the value exceeds ``self.mz_max`` (that first exceeding value is
        included). The list is stored in ``self.histogram_mz_axis[ppm]``.

        :raises ValueError: if the axis would have to grow but cannot,
            because ``ppm`` or ``self.mz_min`` is not positive. Without this
            check the loop below would never terminate.
        """
        ppm_mult = ppm * 1e-6
        mz_current = self.mz_min
        if mz_current <= self.mz_max and (ppm_mult <= 0 or mz_current <= 0):
            raise ValueError(
                'cannot build histogram axis: ppm ({}) and mz_min ({}) '
                'must both be positive'.format(ppm, self.mz_min))
        mz_list = [
            mz_current,
        ]
        while mz_current <= self.mz_max:
            mz_current = mz_current + mz_current * ppm_mult
            mz_list.append(mz_current)
        self.histogram_mz_axis[ppm] = mz_list

    def get_histogram_axis(self, ppm=1.):
        """Return the cached histogram m/z axis for ``ppm``, building it on first use.

        The axis is looked up in ``self.histogram_mz_axis``; on a miss a
        message is printed and ``generate_histogram_axis`` fills the cache.
        """
        if ppm not in self.histogram_mz_axis:
            # print() with a single argument behaves the same on Python 2 and 3
            print('generating histogram axis for ppm {}'.format(ppm))
            self.generate_histogram_axis(ppm=ppm)
        return self.histogram_mz_axis[ppm]

    def generate_summary_spectrum(self,
                                  summary_type='mean',
                                  ppm=1.,
                                  hist_axis=None):
        """Summarise the cached peaks along a histogram m/z axis.

        :param summary_type: ``'mean'`` sums the intensities in each bin;
            ``'freq'`` counts the distinct spectrum indices with a peak in
            the bin. Both are divided by ``len(self.index_list)``.
        :param ppm: spacing used to fetch the default axis from
            ``get_histogram_axis`` when ``hist_axis`` is not supplied.
        :param hist_axis: optional bin edges (list or ndarray); ``None`` or
            an empty sequence selects the default axis.
        :return: ``(hist_axis, values)`` where ``values`` has one entry per
            edge; the final entry has no bin and stays 0.
        :raises ValueError: if ``summary_type`` is not recognised.

        Bins are closed on both sides, so a peak lying exactly on an interior
        edge is counted in both neighbouring bins.
        """
        if summary_type not in ('mean', 'freq'):
            raise ValueError(
                'Summary type not recognised; {}'.format(summary_type))
        # len() instead of ``== []`` so an ndarray axis is handled too
        if hist_axis is None or len(hist_axis) == 0:
            hist_axis = self.get_histogram_axis(ppm=ppm)
        edges = np.asarray(hist_axis)
        # calculate the summary along the m/z axis
        mean_spec = np.zeros(np.shape(hist_axis))
        # one vectorised search for all bins instead of a bisect per bin
        idx_left = np.searchsorted(self.mz_list, edges[:-1], side='left')
        idx_right = np.searchsorted(self.mz_list, edges[1:], side='right')
        for ii, (il, ir) in enumerate(zip(idx_left, idx_right)):
            if summary_type == 'mean':
                mean_spec[ii] = np.sum(self.count_list[il:ir])
            else:
                mean_spec[ii] = float(len(np.unique(self.idx_list[il:ir])))
        mean_spec = mean_spec / len(self.index_list)
        return hist_axis, mean_spec

    def get_summary_image(self, summary_func='tic'):
        """Return a datacube holding one precomputed per-spectrum summary.

        :param summary_func: name of the attribute to plot; ``'tic'`` or
            ``'mic'``.
        :return: the datacube from ``self.empty_datacube()`` with the chosen
            attribute added as a single XIC (m/z and tolerance both ``[0]``).
        :raises KeyError: for any other ``summary_func``.
        """
        if summary_func != 'tic' and summary_func != 'mic':
            raise KeyError("requested type not in 'tic' mic'")
        image = self.empty_datacube()
        values = np.asarray(getattr(self, summary_func))
        image.add_xic(values, [0], [0])
        return image
Exemplo n.º 33
0
class inMemoryIMS():
    """Imaging mass spectrometry dataset (.imzML or .hdf5) held in memory.

    When ``cache_spectra`` is set, the centroid peaks of every spectrum are
    concatenated into three parallel arrays sorted by m/z (``mz_list``,
    ``count_list``, ``idx_list``) so an ion image is a slice plus a
    ``np.bincount``. When ``do_summary`` is set, per-spectrum total (``tic``)
    and maximum (``mic``) intensities are recorded while loading.
    """

    def __init__(self, filename, min_mz=0., max_mz=np.inf, min_int=0., index_range=[],
                 cache_spectra=True, do_summary=True, norm=''):
        """Open ``filename`` and load it; see ``load_file`` for the parameters."""
        # Kept for its side effect: fails immediately if the file is missing.
        os.path.getsize(filename)
        self.load_file(filename, min_mz, max_mz, min_int, index_range=index_range,
                       cache_spectra=cache_spectra, do_summary=do_summary, norm=norm)

    def load_file(self, filename, min_mz=0, max_mz=np.inf, min_int=0, index_range=[],
                  cache_spectra=True, do_summary=True, norm=[]):
        """Parse ``filename`` and populate the in-memory structures.

        :param filename: path ending in ``.hdf5`` or ``.imzml`` (case-insensitive).
        :param min_mz: peaks must have m/z strictly greater than this.
        :param max_mz: peaks must have m/z strictly less than this.
        :param min_int: peaks must have intensity strictly greater than this.
        :param index_range: spectrum indices to load from an HDF5 file; empty
            means every key under ``/spectral_data``. Ignored for imzML.
        :param cache_spectra: keep all peaks in memory for ``get_ion_image``.
        :param do_summary: record ``tic`` and ``mic`` per spectrum.
        :param norm: ``'TIC'``, ``'RMS'`` or ``'MAD'`` to normalise each
            spectrum in ``get_spectrum``; other values leave counts unchanged.
        :raises TypeError: for an unrecognised extension, or when a spectrum's
            m/z and intensity arrays differ in length.
        """
        # parse file to get required parameters
        # can use thin hdf5 wrapper for getting data from file
        self.file_dir, self.filename = os.path.split(filename)
        self.filename, self.file_type = os.path.splitext(self.filename)
        self.file_type = self.file_type.lower()
        self.norm = norm
        if self.file_type == '.hdf5':
            import h5py
            self.hdf = h5py.File(filename, 'r')  # Readonly, file must exist
            if index_range is None or len(index_range) == 0:
                # list(): on Python 3 a bare map() would be exhausted by max()
                self.index_list = list(map(int, self.hdf['/spectral_data'].keys()))
            else:
                self.index_list = index_range
        elif self.file_type == '.imzml':
            from pyimzml.ImzMLParser import ImzMLParser
            self.imzml = ImzMLParser(filename)
            self.index_list = list(range(0, len(self.imzml.coordinates)))
        else:
            raise TypeError('File type not recogised: {}'.format(self.file_type))
        self.max_index = max(self.index_list)
        self.coords = self.get_coords()
        step_size = self.get_step_size()
        cube = ion_datacube(step_size=step_size)
        cube.add_coords(self.coords)
        self.cube_pixel_indices = cube.pixel_indices
        self.cube_n_row, self.cube_n_col = cube.nRows, cube.nColumns
        self.histogram_mz_axis = {}
        self.mz_min = 9999999999999.
        self.mz_max = 0.
        if cache_spectra or do_summary:
            # load data into memory
            self.mz_list = []
            self.count_list = []
            self.idx_list = []
            if do_summary:
                # Indexed by spectrum index below, so size by the largest
                # index rather than by the number of spectra.
                self.mic = np.zeros((self.max_index + 1, 1))
                self.tic = np.zeros((self.max_index + 1, 1))
            for ii in self.index_list:
                # load spectrum, keep values gt0 (shouldn't be here anyway)
                this_spectrum = self.get_spectrum(ii)
                mzs, counts = this_spectrum.get_spectrum(source='centroids')
                if len(mzs) != len(counts):
                    raise TypeError('length of mzs ({}) not equal to counts ({})'.format(len(mzs), len(counts)))
                # Enforce data limits
                valid = np.where((mzs > min_mz) & (mzs < max_mz) & (counts > min_int))
                counts = counts[valid]
                mzs = mzs[valid]
                if len(mzs) == 0:
                    # nothing survived the filters: tic/mic stay 0 for this pixel
                    continue
                # record min/max (first/last entry; assumes each spectrum is
                # sorted by m/z)
                if mzs[0] < self.mz_min:
                    self.mz_min = mzs[0]
                if mzs[-1] > self.mz_max:
                    self.mz_max = mzs[-1]
                # append ever-growing lists (should probably be preallocated or piped to disk and re-loaded)
                if cache_spectra:
                    self.mz_list.append(mzs)
                    self.count_list.append(counts)
                    self.idx_list.append(np.full(len(mzs), ii, dtype=int))
                # record summary values
                if do_summary:
                    self.tic[ii] = sum(counts)
                    self.mic[ii] = max(counts)
            print('loaded spectra')
            if cache_spectra:
                if self.mz_list:
                    self.mz_list = np.concatenate(self.mz_list)
                    self.count_list = np.concatenate(self.count_list)
                    self.idx_list = np.concatenate(self.idx_list)
                else:
                    # np.concatenate rejects an empty sequence
                    self.mz_list = np.zeros(0)
                    self.count_list = np.zeros(0)
                    self.idx_list = np.zeros(0, dtype=int)
                # sort by mz for fast image formation
                mz_order = np.argsort(self.mz_list)
                self.mz_list = self.mz_list[mz_order]
                self.count_list = self.count_list[mz_order]
                self.idx_list = self.idx_list[mz_order]
                # split binary searches into two stages for better locality
                self.window_size = 1024
                self.mz_sublist = self.mz_list[::self.window_size].copy()
        print('file loaded')

    def get_step_size(self):
        """Return the step size passed to ``ion_datacube``: unit steps for imzML, ``[]`` otherwise."""
        if self.file_type == '.imzml':
            return [1, 1, 1]
        else:
            return []

    def get_coords(self):
        """Return the per-spectrum coordinate array from the matching parser.

        For imzML the first two columns are swapped after reading.
        """
        # wrapper for redirecting requests to correct parser
        if self.file_type == '.imzml':
            coords = self.get_coords_imzml()
            coords[:, [0, 1]] = coords[:, [1, 0]]
        elif self.file_type == '.hdf5':
            coords = self.get_coords_hdf5()
        return coords

    def get_coords_imzml(self):  # get real world coordinates
        """Return the parser's coordinates as an array, adding a zero z column to 2-D data."""
        print('TODO: convert indices into real world coordinates')
        coords = np.asarray(self.imzml.coordinates)
        if len(self.imzml.coordinates[0]) == 2:  # 2D - append zero z-coord
            coords = np.concatenate((coords, np.zeros((len(coords), 1))), axis=1)
        return coords

    def get_coords_hdf5(self):
        """Read ``/spectral_data/<k>/coordinates/`` for every index into an (n, 3) array."""
        # NOTE(review): rows are addressed by spectrum index k while the array
        # has len(index_list) rows - this only works when the indices are
        # 0..n-1; confirm before loading a partial index_range.
        coords = np.zeros((len(self.index_list), 3))
        for k in self.index_list:
            coords[k, :] = self.hdf['/spectral_data/' + str(k) + '/coordinates/']
        return coords

    def get_spectrum(self, index):
        """Return spectrum ``index``, with centroid counts normalised per ``self.norm``."""
        # wrapper for redirecting requests to correct parser
        if self.file_type == '.imzml':
            this_spectrum = self.get_spectrum_imzml(index)
        elif self.file_type == '.hdf5':
            this_spectrum = self.get_spectrum_hdf5(index)
        if self.norm != []:
            mzs, counts = this_spectrum.get_spectrum(source="centroids")
            if self.norm == 'TIC':
                # true_divide: plain '/' floor-divides integer counts on Python 2
                counts = np.true_divide(counts, np.sum(counts))
            elif self.norm == 'RMS':
                counts = counts / np.sqrt(np.mean(np.square(counts)))
            elif self.norm == 'MAD':
                counts = counts / np.median(np.absolute(counts - np.mean(counts)))
            this_spectrum.add_centroids(mzs, counts)
        return this_spectrum

    def get_spectrum_imzml(self, index):
        """Read spectrum ``index`` from the imzML parser and store it as centroids."""
        mzs, intensities = self.imzml.getspectrum(index)
        ## temp hack -> assume centroided
        this_spectrum = mass_spectrum()
        this_spectrum.add_centroids(mzs, intensities)
        return this_spectrum

    def get_spectrum_hdf5(self, index):
        """Read spectrum ``index`` from the HDF5 file.

        Adds the profile pair and/or the centroid pair found under
        ``/spectral_data/<index>``.

        :raises ValueError: if neither pair is present.
        """
        import h5py
        this_spectrum = mass_spectrum()
        tmp_str = '/spectral_data/%d' % (index)
        try:
            this_spectrum.add_spectrum(self.hdf[tmp_str + '/mzs/'], self.hdf[tmp_str + '/intensities/'])
            got_spectrum = True
        except KeyError:
            got_spectrum = False
        try:
            this_spectrum.add_centroids(self.hdf[tmp_str + '/centroid_mzs/'],
                                        self.hdf[tmp_str + '/centroid_intensities/'])
            got_centroids = True
        except KeyError:
            got_centroids = False
        if not any([got_spectrum, got_centroids]):
            raise ValueError('No spectral data found in index {}'.format(index))
        return this_spectrum

    def empty_datacube(self):
        """Return a new ``ion_datacube`` pre-filled with this dataset's pixel layout."""
        data_out = ion_datacube()
        # add precomputed pixel indices
        data_out.coords = self.coords
        data_out.pixel_indices = self.cube_pixel_indices
        data_out.nRows = self.cube_n_row
        data_out.nColumns = self.cube_n_col
        return data_out

    def get_ion_image(self, mzs, tols, tol_type='ppm'):
        """Sum cached peak intensities per spectrum for each m/z window.

        :param mzs: window centre(s); a scalar or a sequence of m/z values.
        :param tols: window half-width(s); a scalar, a one-element sequence
            (applied to every m/z) or one value per m/z.
        :param tol_type: ``'ppm'`` scales ``tols`` relative to each centre;
            any other value uses ``tols`` directly as m/z units.
        :return: a datacube with one XIC per requested m/z, in input order.

        Requires the cache built by ``load_file(cache_spectra=True)``.
        """
        data_out = self.empty_datacube()
        # Promote scalars to one-element arrays so lists, arrays and scalars
        # are all handled the same way.
        mzs = np.asarray(mzs)
        if mzs.ndim == 0:
            mzs = mzs.reshape(1)
        tols = np.asarray(tols)
        if tols.ndim == 0:
            tols = tols.reshape(1)
        if len(tols) == 1:
            tols = tols * np.ones(np.shape(mzs))
        if tol_type == 'ppm':
            tols = tols * mzs / 1e6  # to m/z

        # Fast search for insertion point of mz in self.mz_list
        # First stage is looking for windows using the sublist.
        # The side is spelled out: NumPy deprecated the 'l'/'r' shorthands.
        idx_left = np.searchsorted(self.mz_sublist, mzs - tols, side='left')
        idx_right = np.searchsorted(self.mz_sublist, mzs + tols, side='right')
        for mz, tol, il, ir in zip(mzs, tols, idx_left, idx_right):
            # Widen by one window on the left: the true start may sit in the
            # window that begins just below the lower bound.
            l = max(il - 1, 0) * self.window_size
            r = ir * self.window_size
            # Second stage is binary search within the windows
            il = l + np.searchsorted(self.mz_list[l:r], mz - tol, side='left')
            ir = l + np.searchsorted(self.mz_list[l:r], mz + tol, side='right')
            # bin the matching peaks by spectrum index
            ion_vect = np.bincount(self.idx_list[il:ir],
                                   weights=self.count_list[il:ir],
                                   minlength=self.max_index + 1)
            data_out.add_xic(ion_vect, [mz], [tol])
        return data_out

    def generate_histogram_axis(self, ppm=1.):
        """Build and cache a geometric m/z axis with ``ppm`` relative spacing.

        Runs from ``self.mz_min`` to the first value above ``self.mz_max``
        and stores the list in ``self.histogram_mz_axis[ppm]``.

        :raises ValueError: if the axis has to grow but ``ppm`` or
            ``self.mz_min`` is not positive (the loop would never end).
        """
        ppm_mult = ppm * 1e-6
        mz_current = self.mz_min
        if mz_current <= self.mz_max and (ppm_mult <= 0 or mz_current <= 0):
            raise ValueError(
                'cannot build histogram axis: ppm ({}) and mz_min ({}) '
                'must both be positive'.format(ppm, self.mz_min))
        mz_list = [mz_current, ]
        while mz_current <= self.mz_max:
            mz_current = mz_current + mz_current * ppm_mult
            mz_list.append(mz_current)
        self.histogram_mz_axis[ppm] = mz_list

    def get_histogram_axis(self, ppm=1.):
        """Return the cached histogram axis for ``ppm``, generating it on first use."""
        if ppm not in self.histogram_mz_axis:
            print('generating histogram axis for ppm {}'.format(ppm))
            self.generate_histogram_axis(ppm=ppm)
        return self.histogram_mz_axis[ppm]

    def generate_summary_spectrum(self, summary_type='mean', ppm=1.):
        """Summarise the cached peaks along the ``ppm`` histogram axis.

        :param summary_type: ``'mean'`` sums the intensities in each bin;
            ``'freq'`` counts the distinct spectra with a peak in the bin.
            Both are divided by ``len(self.index_list)``.
        :param ppm: spacing of the histogram axis.
        :return: ``(hist_axis, values)``; ``values`` has one entry per axis
            point and the final entry (no bin) stays 0.
        :raises ValueError: if ``summary_type`` is not recognised.

        Bins are closed on both sides, so a peak exactly on an interior edge
        is counted in both neighbouring bins.
        """
        if summary_type not in ('mean', 'freq'):
            raise ValueError('Summary type not recognised; {}'.format(summary_type))
        hist_axis = self.get_histogram_axis(ppm=ppm)
        edges = np.asarray(hist_axis)
        mean_spec = np.zeros(np.shape(hist_axis))
        # one vectorised search for all bins instead of a bisect per bin
        idx_left = np.searchsorted(self.mz_list, edges[:-1], side='left')
        idx_right = np.searchsorted(self.mz_list, edges[1:], side='right')
        for ii, (il, ir) in enumerate(zip(idx_left, idx_right)):
            if summary_type == 'mean':
                mean_spec[ii] = np.sum(self.count_list[il:ir])
            else:
                mean_spec[ii] = float(len(np.unique(self.idx_list[il:ir])))
        mean_spec = mean_spec / len(self.index_list)
        return hist_axis, mean_spec

    def get_summary_image(self, summary_func='tic'):
        """Return a datacube holding the ``'tic'`` or ``'mic'`` value of each loaded spectrum.

        :raises KeyError: for any other ``summary_func``.
        """
        if summary_func not in ['tic', 'mic']:
            raise KeyError("requested type not in 'tic' mic'")
        data_out = self.empty_datacube()
        data_out.add_xic(np.asarray(getattr(self, summary_func))[self.index_list], [0], [0])
        return data_out
Exemplo n.º 34
0

# Distinct cluster labels, computed once and reused below.
clusters = np.unique(image_UPGMA_pixel1)
n_clusters = len(clusters)
print(n_clusters)

# For each cluster: its consensus spectrum, and the distance
# (1 - similarity) of every member spectrum to that consensus.
cluster2concensus = {}
cluster2comparison = {}
for cluster in clusters:
    print(cluster)
    cluster2concensus[cluster] = consensus.get_consensus(
        cluster, image_UPGMA_pixel1, dist_dot_product, ids, imzMLfile, xs, ys)
    cluster_ids = consensus.get_cluster_elements(cluster, image_UPGMA_pixel1,
                                                 parser, xs, ys)
    tmp = list()
    for i in cluster_ids:
        tmp.append(
            1 - (get_similarity(cluster2concensus[cluster],
                                consensus.tupel2map(parser.getspectrum(i)))))
    cluster2comparison[cluster] = tmp

# Symmetric matrix of pairwise distances between consensus spectra.
# Rows/columns follow the order of ``clusters``; the dict is looked up by
# cluster label, not by matrix position, so labels need not start at 0.
consensus_distance = np.zeros((n_clusters, n_clusters))
for cluster1 in range(n_clusters):
    for cluster2 in range(cluster1, n_clusters):
        consensus_distance[cluster1, cluster2] = consensus_distance[
            cluster2, cluster1] = 1 - get_similarity(
                cluster2concensus[clusters[cluster1]],
                cluster2concensus[clusters[cluster2]])

# One grid row per cluster, three columns.
fig = plt.figure()
grid = plt.GridSpec(n_clusters,
                    3,
                    wspace=0.1,
                    hspace=0.1)
Exemplo n.º 35
0
    def __compute_file_info(cls, filename, resolution):
        ## TODO completely refactor this to make it smartly handle profile or centroid datasets
        ## TODO: centroid datasets should take in a user parameter "Resolution" and resample data at that resolution
        ## TODO: profile datasets should work as is
        ## TODO: checks for profile data vs. centroid data on the variation in length of ['m/z array']
        """
        Internal helper function used to compute the mz axis, data type for the intensities, format type
        and metadata of an imzML file.

        :param filename: Path of the imzML file.
        :param resolution: Divisor used to size the resampled m/z axis of a 'processed' file:
            ``ceil(1e6 * ln(max_mz / min_mz) / resolution)`` points.

        :return: Tuple of
            (coordinates array, m/z axis array, intensity dtype string, imzml file type,
            dataset metadata, instrument metadata, method metadata)
        """
        reader = ImzMLParser(filename)
        # Read the first spectrum
        mz_axes, intens = reader.getspectrum(0)   # NOTE: may be a tuple or an array depending on the parser
        # Read the coordinates
        coordinates = np.asarray(reader.coordinates)

        # #Start the data at [0,0,0]
        # coordinates[:,0] = coordinates[:,0] - np.amin(coordinates,axis=0)[0]
        # coordinates[:,1] = coordinates[:,1] - np.amin(coordinates,axis=0)[1]
        # coordinates[:,2] = coordinates[:,2] - np.amin(coordinates,axis=0)[2]

        # Determine the data type for the intensity values
        dtype = np.asarray(intens).dtype.str

        # Compute the mz axis and file type: the file is 'continuous' only if
        # every spectrum shares the m/z axis of the first one.
        file_type = cls.available_imzml_types['continuous']
        min_mz, max_mz = np.amin(mz_axes), np.amax(mz_axes)
        for ind in range(coordinates.shape[0]):      #for ind, loc in enumerate(reader.coordinates):
            mz, intens = reader.getspectrum(ind)
            # array_equal works for tuples and ndarrays alike; a plain ``==``
            # on ndarrays is elementwise and cannot be used as a condition
            if np.array_equal(mz, mz_axes):
                continue
            file_type = cls.available_imzml_types['processed']
            if len(mz) == 0:
                # an empty spectrum has no range to contribute
                continue
            if min_mz > np.amin(mz):
                min_mz = np.amin(mz)
            if max_mz < np.amax(mz):
                max_mz = np.amax(mz)
        # Reinterpolate the mz-axis if we have a processed mode imzml file
        if file_type == cls.available_imzml_types['processed']:
            f = np.ceil(1e6 * np.log(max_mz/min_mz)/resolution)
            # np.logspace requires an integer number of samples
            mz_axes = np.logspace(np.log10(min_mz), np.log10(max_mz), int(f))
            log_helper.info(__name__, "Reinterpolated m/z axis for processed imzML file")

        # Construct the imzml metadata information
        dataset_metadata = metadata_dict()
        instrument_metadata = metadata_dict()
        method_metadata = metadata_dict()
        for k, v in reader.imzmldict.items():
            dataset_metadata[k] = metadata_value(name=k,
                                                 value=v,
                                                 unit=None,
                                                 description=k,
                                                 ontology=None)

        # Delete the parser and read the metadata
        del reader

        # Parse the metadata for the file. We try to parse only the header and ignore the
        # <run > group in the XML file to avoid going through the whole file again
        # while extracting the majority of the relevant metadata
        try:
            with open(filename, 'r') as ins:
                metdata_header = ''
                for line in ins:
                    if '<run' in line:
                        break
                    else:
                        metdata_header += line
                metdata_header += '</mzML>'
                metdata_header_dict = xmltodict.parse(metdata_header)['mzML']
                for k, v in metdata_header_dict.items():
                    store_value = metadata_value(name=k,
                                                 value=v,
                                                 unit=None,
                                                 description=str(k) + " extracted from imzML XML header.",
                                                 ontology=None)
                    if k == 'instrumentConfigurationList':
                        instrument_metadata[k] = store_value
                    elif k == 'dataProcessingList':
                        method_metadata[k] = store_value
                    elif k == 'scanSettingsList':
                        dataset_metadata[k] = store_value
                    elif k == 'softwareList':
                        method_metadata[k] = store_value
                    elif k =='sampleList':
                        method_metadata[k] = store_value
                    else:
                        dataset_metadata[k] = store_value
                dataset_metadata['imzml_xml_metadata_header'] = metadata_value(name='imzml_xml_metadata_header',
                                                                               value=metdata_header,
                                                                               unit=None,
                                                                               description='XML imzML header',
                                                                               ontology=None)
        except Exception:
            # Best effort: the extra header metadata is optional. Catching
            # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
            log_helper.warning(__name__, "Extraction of additional imzML metadata failed")

        return coordinates, np.asarray(mz_axes), dtype, file_type, dataset_metadata, instrument_metadata, method_metadata
Exemplo n.º 36
0
class ImzMLHandler:

    # cropToData: (if True) Remove all rows and columns which contain no data. For some datasets (e.g. Bruker)
    # the coordinates stored are relative to some external coordinate system, and therefore large
    # amounts of empty space can be present
    def __init__(self,
                 filename,
                 startX=1,
                 startY=1,
                 width=None,
                 height=None,
                 cropToData=False):
        """Open an imzML file and index its spectra by pixel position.

        :param filename: path handed to ``ImzMLParser``.
        :param startX: x coordinate of the left-most pixel of the region.
        :param startY: y coordinate of the top-most pixel of the region.
        :param width: region width in pixels; ``None`` extends to the largest x.
        :param height: region height in pixels; ``None`` extends to the largest y.
        :param cropToData: if True, ``startX``/``startY`` are replaced by the
            smallest x/y present in the file and the stored coordinates are
            shifted so the first data column/row is 1.

        Sets ``self.coordinates`` (list of ``(spectrum index, x, y)`` for the
        pixels inside the region) and ``self.indexImage`` (a
        ``(height, width)`` integer array holding the spectrum index of each
        pixel, ``-1`` where there is none).
        """
        self.imzML = ImzMLParser(filename)

        # Find the min and max row and column where data is present
        maxWidth = 0
        maxHeight = 0

        # -1 marks "not seen yet"
        minWidth = -1
        minHeight = -1

        for (x, y, z) in self.imzML.coordinates:
            if x > maxWidth:
                maxWidth = x
            if y > maxHeight:
                maxHeight = y
            if minWidth == -1 or minWidth > x:
                minWidth = x
            if minHeight == -1 or minHeight > y:
                minHeight = y

        if cropToData:
            startX = minWidth
            startY = minHeight

        if width is None:
            width = maxWidth - startX + 1
        if height is None:
            height = maxHeight - startY + 1

        self.startX = startX
        self.startY = startY
        self.width = width
        self.height = height
        self.coordinates = []
        self.cropToData = cropToData

        # builtin int: the np.int alias was removed from NumPy
        self.indexImage = np.ones((height, width), dtype=int) * -1

        # ``index`` is the spectrum's position in the file; it advances for
        # every coordinate, including those outside the region.
        for index, (x, y, z) in enumerate(self.imzML.coordinates):
            if x >= startX and y >= startY and x < (startX + width) and y < (
                    startY + height):
                if cropToData:
                    self.coordinates.append(
                        (index, x - minWidth + 1, y - minHeight + 1))
                    self.indexImage[y - minHeight, x - minWidth] = index
                else:
                    self.coordinates.append((index, x, y))
                    self.indexImage[y - startY, x - startX] = index

    def getSpectrumWithIndex(self, index):
        """Return whatever the underlying parser's ``getspectrum(index)`` yields."""
        parser = self.imzML
        return parser.getspectrum(index)

    def getSpectrumAt(self, x, y):
        """Return the spectrum stored at pixel ``(x, y)``.

        ``x``/``y`` use the same convention as the entries of
        ``self.coordinates``: 1-based positions relative to the data when
        ``cropToData`` is set, file coordinates otherwise.

        :raises ValueError: if ``x`` or ``y`` is not positive.
        :raises IndexError: if the pixel lies outside the indexed region or
            no spectrum was recorded there.
        """
        if x <= 0 or y <= 0:
            raise ValueError(
                'Both x and y must be positive (> 0) integers, as per .imzML specification.'
            )

        # Mirror how indexImage is filled in __init__ / read in getTICImage.
        if self.cropToData:
            row, col = y - 1, x - 1
        else:
            row, col = y - self.startY, x - self.startX
        if row < 0 or col < 0:
            # a negative index would silently wrap around to the far edge
            raise IndexError(
                'Pixel (x={}, y={}) lies outside the indexed region'.format(x, y))

        index = self.indexImage[row, col]
        if index < 0:
            # -1 marks "no data"; passing it on would fetch the wrong spectrum
            raise IndexError(
                'No spectrum recorded at pixel (x={}, y={})'.format(x, y))
        return self.imzML.getspectrum(index)

    def getTICImage(self):
        """Return a ``(height, width)`` image of the summed intensity of each stored pixel.

        Pixels without a spectrum stay 0. Stored coordinates are 1-based when
        ``cropToData`` is set and offset by ``startX``/``startY`` otherwise.
        """
        tic = np.zeros((self.height, self.width))

        for index, x, y in self.coordinates:
            _, counts = self.imzML.getspectrum(index)
            total = np.sum(counts)
            if not self.cropToData:
                tic[y - self.startY, x - self.startX] = total
            else:
                tic[y - 1, x - 1] = total

        return tic

    def determineMinMaxMZ(self, pixelsToSample=100):
        """Estimate the recorded m/z range from randomly sampled spectra.

        Draws ``pixelsToSample`` stored pixels (with replacement) and tracks
        the smallest first m/z and largest last m/z seen. Returns
        ``(minMZ, maxMZ)``; with nothing sampled this is ``(-1, 0)``.
        """
        # TODO: Check in the metadata
        lowest = -1   # -1 means "nothing sampled yet"
        highest = 0

        for _ in range(pixelsToSample):
            pick = random.randint(0, len(self.coordinates) - 1)
            index, _x, _y = self.coordinates[pick]
            mzs, counts = self.imzML.getspectrum(index)

            first, last = mzs[0], mzs[-1]
            if lowest == -1 or first < lowest:
                lowest = first
            if highest < last:
                highest = last

        return lowest, highest

    def estimatePPM(self, minMZ, maxMZ, numBins=10, pixelsToSample=100):
        """Estimate the smallest peak spacing, in ppm, per m/z bin.

        ``[minMZ, maxMZ)`` is split into ``numBins`` equal bins. For each of
        ``pixelsToSample`` randomly drawn spectra (with replacement) the
        spacing between consecutive m/z values is expressed in ppm of the
        lower value, and each bin keeps the smallest spacing seen. Bins that
        never receive a value keep the initial 1e5.
        """
        estimates = np.ones(numBins) * 1e5

        for _ in range(pixelsToSample):
            pick = random.randint(0, len(self.coordinates) - 1)
            index, _x, _y = self.coordinates[pick]
            mzs, counts = self.imzML.getspectrum(index)

            # spacing to the next peak, relative to the lower peak of each pair
            lower = mzs[:-1]
            spacing_ppm = (mzs[1:] - lower) * 1e6 / lower

            bin_width = (maxMZ - minMZ) / numBins
            for b in range(numBins):
                lo = minMZ + (b * bin_width)
                hi = minMZ + ((b + 1) * bin_width)
                in_bin = spacing_ppm[np.logical_and(lower >= lo, lower < hi)]
                if len(in_bin) == 0:
                    continue
                smallest = np.min(in_bin)
                if estimates[b] > smallest:
                    estimates[b] = smallest

        return estimates

    def generateMeanSpectrum(self, startmz, endmz, ppm):
        """Accumulate a binned spectrum on a ppm-spaced axis and store it.

        Sets ``self.mzAxis`` from ``ImzMLHandler.generateMZAxis`` and
        ``self.meanSpectrum`` to the accumulated counts divided by the number
        of stored pixels. Only spectra whose index is a multiple of 10 are
        read. Returns ``self.meanSpectrum``.
        """
        self.mzAxis = ImzMLHandler.generateMZAxis(startmz, endmz, ppm)

        n_bins = self.mzAxis.shape[0] - 1
        accum = np.zeros((n_bins))

        # bin number = round(log(mz / axis start) / log(1 + ppm * 1e-6))
        log_start = np.log(self.mzAxis[0])
        log_step = np.log(1 + ppm * 1e-6)

        for index, x, y in self.coordinates:
            if index % 10 != 0:
                continue
            mzs, counts = self.imzML.getspectrum(index)

            for i, mz in enumerate(mzs):
                bin_idx = int(np.round((np.log(mz) - log_start) / log_step))
                if bin_idx < 0:
                    # below the axis start
                    continue
                if bin_idx >= n_bins:
                    # past the axis end: the rest of this spectrum is skipped
                    break
                accum[bin_idx] += counts[i]

        self.meanSpectrum = accum / len(self.coordinates)

        return self.meanSpectrum

    def generateIonImage(self, mz, ppm):
        """Return a ``(height, width)`` image of the intensity within ``mz`` +/- ``ppm``.

        For every stored pixel the counts of all peaks with
        ``mz - delta <= m/z <= mz + delta`` (``delta = ppm * 1e-6 * mz``) are
        added to that pixel. Scanning stops at the first peak above the
        window, so each spectrum is assumed to be sorted by m/z.
        """
        ionImage = np.zeros((self.height, self.width))

        deltamz = ppm * 1e-6 * mz
        minmz = mz - deltamz
        maxmz = mz + deltamz

        # Stored coordinates are 1-based when cropped, otherwise offset by the
        # region origin - same convention as indexImage and getTICImage.
        if self.cropToData:
            rowOffset, colOffset = 1, 1
        else:
            rowOffset, colOffset = self.startY, self.startX

        for index, x, y in self.coordinates:
            mzs, counts = self.imzML.getspectrum(index)

            for mzIndex in range(len(mzs)):
                if mzs[mzIndex] > maxmz:
                    break

                if mzs[mzIndex] >= minmz:
                    ionImage[y - rowOffset, x - colOffset] += counts[mzIndex]

        return ionImage

    def generateIonImages(self, mzsToGenerate, ppm):
        """Return a ``(height, width, n)`` stack of ion images, one per requested m/z.

        For each stored pixel and each m/z, the counts of the peaks with
        ``mz - delta < m/z <= mz + delta`` (``delta = ppm * 1e-6 * mz``) are
        summed into the matching plane.
        """
        mzsToGenerate = np.array(mzsToGenerate)
        ionImages = np.zeros((self.height, self.width, len(mzsToGenerate)))

        deltamz = ppm * 1e-6 * mzsToGenerate
        minmz = mzsToGenerate - deltamz
        maxmz = mzsToGenerate + deltamz

        # Stored coordinates are 1-based when cropped, otherwise offset by the
        # region origin - same convention as indexImage and getTICImage.
        if self.cropToData:
            rowOffset, colOffset = 1, 1
        else:
            rowOffset, colOffset = self.startY, self.startX

        for index, x, y in self.coordinates:
            mzs, counts = self.imzML.getspectrum(index)

            for l in range(len(mzsToGenerate)):
                inWindow = np.logical_and(mzs > minmz[l], mzs <= maxmz[l])
                ionImages[y - rowOffset, x - colOffset, l] = np.sum(counts[inWindow])

        return ionImages

    def generateDatacubeMZs(self, limits, ticNorm=False):
        """Integrate every spectrum over a set of m/z windows.

        Row ``i`` of the result belongs to the ``i``-th entry of
        ``self.coordinates``; column ``k`` holds the summed intensity with
        ``limits[k, 0] < mz <= limits[k, 1]``.

        Args:
            limits: array-like of shape ``(n_windows, 2)`` with the lower and
                upper m/z bound of each window (a list of pairs is accepted).
            ticNorm: when true, each spectrum is divided by its total ion
                count before integration.  Spectra whose total is zero are
                left unscaled so that they yield zeros instead of NaN.

        Returns:
            numpy.ndarray: ``(len(self.coordinates), n_windows)`` datacube,
            also stored in ``self.datacube``.
        """
        limits = np.asarray(limits)
        datacube = np.zeros((len(self.coordinates), len(limits)))

        for spectrumIndex, (index, x, y) in enumerate(self.coordinates):
            mzs, counts = self.imzML.getspectrum(index)
            mzs = np.asarray(mzs)
            counts = np.asarray(counts)

            # Normalised to TIC
            if ticNorm:
                tic = np.sum(counts)
                # 0/0 would fill the whole row with NaN and poison any
                # later correlation computed on the datacube.
                if tic != 0:
                    counts = counts / tic

            for window in range(len(limits)):
                inWindow = np.logical_and(mzs > limits[window, 0],
                                          mzs <= limits[window, 1])
                datacube[spectrumIndex, window] = np.sum(counts[inWindow])

        self.datacube = datacube

        return self.datacube

    def generateDatacube(self, peaks, left_ips, right_ips, ticNorm=False):
        """Integrate every spectrum over the m/z extent of each peak.

        ``left_ips`` / ``right_ips`` are fractional positions on
        ``self.mzAxis``.  They are widened by one bin on each side (floor - 1
        and ceil + 1) and clipped to the axis, then translated to m/z
        boundaries.  Each spectrum point is added to the first peak, in peak
        order, whose ``[left, right]`` interval contains it.

        The scan assumes peaks (and, presumably, the m/z values of each
        spectrum) are in ascending order -- TODO confirm against callers.

        Args:
            peaks: sequence with one entry per peak; only its length is used.
            left_ips: left interpolated position of each peak on the axis.
            right_ips: right interpolated position of each peak on the axis.
            ticNorm: when true, each spectrum is divided by its total ion
                count first.  Spectra whose total is zero are left unscaled
                so that they yield zeros instead of NaN.

        Returns:
            numpy.ndarray: ``(len(self.coordinates), len(peaks))`` datacube,
            also stored in ``self.datacube``.
        """
        mzAxis = np.asarray(self.mzAxis)
        lastBin = len(mzAxis) - 1

        # Built-in int replaces the np.int alias removed from NumPy.  The
        # clip keeps the widened indices on the axis: -1 would silently wrap
        # to the last bin and len(mzAxis) would raise IndexError.
        left_ips = np.clip(np.floor(left_ips).astype(int) - 1, 0, lastBin)
        right_ips = np.clip(np.ceil(right_ips).astype(int) + 1, 0, lastBin)

        # Look the boundaries up once instead of for every spectrum point.
        leftBounds = mzAxis[left_ips]
        rightBounds = mzAxis[right_ips]

        numPeaks = len(peaks)
        datacube = np.zeros((len(self.coordinates), numPeaks))

        for spectrumIndex, (index, x, y) in enumerate(self.coordinates):
            mzs, counts = self.imzML.getspectrum(index)

            # Normalised to TIC
            if ticNorm:
                tic = np.sum(counts)
                if tic != 0:
                    counts = counts / tic

            curPeakIndex = 0

            for mzIndex in range(len(mzs)):
                mz = mzs[mzIndex]

                # Drop peaks that end before this m/z; they cannot match
                # any later point either.
                while (curPeakIndex < numPeaks
                       and mz > rightBounds[curPeakIndex]):
                    curPeakIndex += 1
                if curPeakIndex >= numPeaks:
                    break

                for peakIndex in range(curPeakIndex, numPeaks):
                    # This and all following peaks start above mz.
                    if mz < leftBounds[peakIndex]:
                        break
                    if mz <= rightBounds[peakIndex]:
                        datacube[spectrumIndex, peakIndex] += counts[mzIndex]
                        break

        self.datacube = datacube

        return self.datacube

    def determineCorrelatedFeatures(self, clusteringThreshold):
        """Merge correlated datacube columns into averaged features.

        The columns of ``self.datacube`` are correlated with each other, the
        rows of the correlation matrix are clustered with
        ``AgglomerativeClustering`` (no fixed cluster count, cut at
        ``clusteringThreshold``), and the members of each cluster are
        averaged into one column.

        Args:
            clusteringThreshold: ``distance_threshold`` handed to
                ``AgglomerativeClustering``.

        Returns:
            numpy.ndarray: ``(n_spectra, n_clusters)`` array with one averaged
            column per cluster, also stored in ``self.uniqueData``.  The
            column indices belonging to each cluster are stored, in the same
            order, in ``self.uniqueDataMembers``.
        """
        ionCorrelationMatrix = np.corrcoef(self.datacube.transpose())
        # Undefined correlations (NaN) are treated as "no correlation".
        ionCorrelationMatrix[np.isnan(ionCorrelationMatrix)] = 0

        clustering = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=clusteringThreshold).fit(ionCorrelationMatrix)

        labels = clustering.labels_

        self.uniqueDataMembers = []
        averageIonImages = []

        for clusterIndex in np.unique(labels):
            clusterMembers = np.where(labels == clusterIndex)[0]

            self.uniqueDataMembers.append(clusterMembers)
            averageIonImages.append(
                np.mean(self.datacube[:, clusterMembers], axis=1))

        # Stack once at the end: avoids comparing an ndarray with ``[]``
        # (ambiguous truth value on recent NumPy) and repeated concatenation.
        if averageIonImages:
            self.uniqueData = np.stack(averageIonImages, axis=1)
        else:
            self.uniqueData = []

        return self.uniqueData

    @staticmethod
    def generateMZAxis(startmz, endmz, ppm):

        numElements = int(
            np.ceil(
                (np.log(endmz) - np.log(startmz)) / np.log(1 + ppm * 1e-6)))

        mzAxis = np.zeros((numElements))

        for i in range(numElements):
            mzAxis[i] = startmz * np.power(1 + ppm * 1e-6, i)

        return mzAxis
Exemplo n.º 37
0
        order = mzs.argsort()
        return mzs[order], intensities[order]

    def saveStatistics(self, filename):
        """Write the collected per-pixel norm maps to one ``.npz`` archive.

        Each of the dictionaries ``_norm_real``, ``_norm_simulated``,
        ``_norm_groundtruth``, ``_norm_noise`` and ``_norm_diff`` (keyed by
        ``(x, y)``) is rasterised into a dense array indexed ``[x, y]`` and
        stored under the names ``real``, ``simulated``, ``groundtruth``,
        ``noise`` and ``diff``.

        Args:
            filename: path of the archive; it is used exactly as given
                because an open file object is handed to ``np.savez``.
        """
        def toRect(d):
            # Dense [x, y] image of a {(x, y): value} mapping; pixels without
            # an entry stay 0.  An empty mapping gives an empty image rather
            # than failing in max().
            if not d:
                return np.zeros((0, 0))
            xs = [k[0] for k in d]
            ys = [k[1] for k in d]
            img = np.zeros((max(xs) + 1, max(ys) + 1))
            for k in d:
                img[k[0], k[1]] = d[k]
            return img

        # np.savez emits a binary zip archive, so the handle must be opened
        # in binary mode (text mode fails on Python 3 and can corrupt the
        # archive on platforms that translate line endings).
        with open(filename, "wb") as f:
            np.savez(f,
                     real=toRect(self._norm_real),
                     simulated=toRect(self._norm_simulated),
                     groundtruth=toRect(self._norm_groundtruth),
                     noise=toRect(self._norm_noise),
                     diff=toRect(self._norm_diff))

# Noise model built from the NMF file, the layers file and the real dataset.
ng = NoiseGenerator(args.nmf, args.layers, args.real)

# Clean simulated dataset that receives the noise.
imzml_sim = ImzMLParser(args.simclean)

# Re-write every simulated spectrum with noise added, keeping its position.
with ImzMLWriter(args.output, mz_dtype=np.float32) as writer:
    for spectrum_index, position in enumerate(imzml_sim.coordinates):
        clean_spectrum = imzml_sim.getspectrum(spectrum_index)
        out_mzs, out_intensities = ng.addNoise(clean_spectrum, position)
        writer.addSpectrum(out_mzs, out_intensities, position)

# Per-pixel norms gathered during the loop go next to the output file.
ng.saveStatistics(args.output + ".norms")
Exemplo n.º 38
0
class NoiseGenerator(object):
    """Adds noise estimated from a real dataset to simulated spectra.

    The noise of a pixel is the absolute difference between the binned real
    spectrum and its ``W.dot(H)`` approximation loaded from ``nmf_fn``.
    Per-pixel norms of the real, simulated, ground-truth and noise signals
    are collected in dictionaries keyed by ``(x, y)`` and can be written out
    with ``saveStatistics``.
    """

    def __init__(self, nmf_fn, layers_fn, imzml_fn):
        """Load the real dataset and the factorisation.

        Args:
            nmf_fn: path of an ``.npz`` file providing the arrays ``shape``,
                ``W``, ``H`` and ``mz_axis`` (rows of ``(mz, ppm)``).
            layers_fn: path of the pickled layers file; unused while the
                call to ``_removeAssignedBins`` stays disabled.
            imzml_fn: path of the real imzML dataset.
        """
        self._imzml = ImzMLParser(imzml_fn)
        with np.load(nmf_fn) as data:
            nx, ny = data['shape']
            self._W = data['W'].reshape((nx, ny, -1))
            self._H = data['H']
            self._mz_axis = data['mz_axis']
        self._norm_real = {}
        self._norm_simulated = {}
        self._norm_groundtruth = {}
        self._norm_noise = {}
        self._norm_diff = {}
        # (x, y) -> spectrum index.  Indexing instead of unpacking keeps this
        # working for coordinates that carry extra components such as z.
        self._coords = {}
        for i, coords in enumerate(self._imzml.coordinates):
            self._coords[(coords[0], coords[1])] = i
        # Upper edge of every m/z bin: mz shifted up by its ppm tolerance.
        self._mz_bins = [mz * (1.0 + 1e-6 * ppm) for mz, ppm in self._mz_axis]

        # self._removeAssignedBins(layers_fn)

    def _removeAssignedBins(self, layers_fn):
        """Zero every entry of ``H`` that is not an assigned m/z bin.

        Currently not called from ``__init__``.
        """
        # buggy at the moment
        # NOTE(review): unpickling executes arbitrary code; only pass layer
        # files that come from a trusted source.
        with open(layers_fn, 'rb') as f:
            layers = cPickle.load(f)
        for i in layers['layers_list']:
            assigned = layers['layers_list'][i]['assigned_mz_bins']
            assigned = assigned[assigned < self._H[i].shape[0]]
            # Call form of print is valid on both Python 2 and Python 3.
            print("#assigned bins in component #{}: {}".format(
                i + 1, len(assigned)))
            h = np.zeros_like(self._H[i])
            h[assigned] = self._H[i][assigned]
            self._H[i] = h

    def _getRealSpectrum(self, x, y):
        """Return the real ``(mzs, intensities)`` spectrum at pixel (x, y)."""
        return self._imzml.getspectrum(self._coords[(x, y)])

    def _norm(self, intensities):
        """Return ``np.linalg.norm`` of ``intensities``."""
        return np.linalg.norm(intensities)

    def generateNoise(self, x, y):
        """Estimate the noise peaks of pixel (x, y).

        The real spectrum is restricted to the range of ``self._mz_bins``
        and binned; the noise per bin is ``|binned real - W[x, y].dot(H)|``
        scaled by the module-level ``args.inflate_noise``.  The norm of the
        binned real spectrum is recorded in ``self._norm_real``.

        Args:
            x: pixel x coordinate.
            y: pixel y coordinate.

        Returns:
            tuple: ``(noise_mzs, noise_intensities)`` limited to the entries
            whose intensity exceeds half of the smallest real intensity in
            range.  Both arrays are empty when no real point is in range.
        """
        real_spectrum = self._getRealSpectrum(x, y)
        real_mzs, real_intensities = (np.array(a) for a in real_spectrum)

        min_mz, max_mz = self._mz_bins[0], self._mz_bins[-1]
        inside_range = (real_mzs >= min_mz) & (real_mzs <= max_mz)
        real_mzs = real_mzs[inside_range]
        real_intensities = real_intensities[inside_range]

        bins = np.digitize(real_mzs, self._mz_bins)
        n_bins = len(self._mz_bins)
        binned_real_intensities = np.bincount(bins, real_intensities, n_bins)
        self._norm_real[(x, y)] = self._norm(binned_real_intensities)
        binned_approx_intensities = self._W[x, y, :].dot(self._H)
        noise = np.abs(binned_real_intensities - binned_approx_intensities)
        # FIXME: avoid duplicating noise
        noise_intensities = noise[bins] * args.inflate_noise
        noise_mzs = np.array(real_mzs)
        # Nothing in range: min() below would fail on an empty array.
        if real_intensities.size == 0:
            return noise_mzs, noise_intensities
        nnz = noise_intensities > min(real_intensities) / 2
        return noise_mzs[nnz], noise_intensities[nnz]

    def addNoise(self, profile_spectrum, coords):
        """Centroid a simulated spectrum and merge it with estimated noise.

        Args:
            profile_spectrum: ``(mzs, intensities)`` of the simulated
                profile spectrum.
            coords: pixel coordinates; only the first two components
                ``(x, y)`` are used.

        Returns:
            tuple: ``(mzs, intensities)`` sorted by m/z, holding the
            centroided peaks plus the noise peaks whose intensity exceeds
            the smallest intensity of the real spectrum at that pixel.
        """
        # A real list is needed: spec[1] is indexed below and ``map`` is
        # lazy on Python 3.
        spec = [np.array(a) for a in profile_spectrum]
        p = centroidize(*spec)
        mzs = np.array(p.masses)
        mult = spec[1].max() if len(spec[1]) > 0 else 1
        intensities = np.array(p.abundances) * mult

        # Pass x and y explicitly; ``*coords`` breaks on (x, y, z) tuples.
        x, y = coords[:2]
        limit = min(self._getRealSpectrum(x, y)[1])

        noise_mzs, noise_intensities = self.generateNoise(x, y)
        pixel = (x, y)
        self._norm_noise[pixel] = self._norm(
            noise_intensities[noise_intensities > limit])
        self._norm_groundtruth[pixel] = self._norm(
            intensities[intensities > limit])
        self._norm_simulated[pixel] = (self._norm_noise[pixel] +
                                       self._norm_groundtruth[pixel])
        self._norm_diff[pixel] = abs(self._norm_simulated[pixel] -
                                     self._norm_real[pixel])
        mzs = np.concatenate([mzs, noise_mzs])
        intensities = np.concatenate([intensities, noise_intensities])

        detectable = np.where(intensities > limit)[0]
        mzs = mzs[detectable]
        intensities = intensities[detectable]

        order = mzs.argsort()
        return mzs[order], intensities[order]

    def saveStatistics(self, filename):
        """Write the collected per-pixel norm maps to one ``.npz`` archive.

        Each norm dictionary is rasterised into a dense array indexed
        ``[x, y]`` and stored under the names ``real``, ``simulated``,
        ``groundtruth``, ``noise`` and ``diff``.

        Args:
            filename: path of the archive; it is used exactly as given
                because an open file object is handed to ``np.savez``.
        """
        def toRect(d):
            # Dense [x, y] image of a {(x, y): value} mapping; pixels without
            # an entry stay 0.  An empty mapping gives an empty image rather
            # than failing in max().
            if not d:
                return np.zeros((0, 0))
            xs = [k[0] for k in d]
            ys = [k[1] for k in d]
            img = np.zeros((max(xs) + 1, max(ys) + 1))
            for k in d:
                img[k[0], k[1]] = d[k]
            return img

        # np.savez emits a binary zip archive, so the handle must be opened
        # in binary mode (text mode fails on Python 3 and can corrupt the
        # archive on platforms that translate line endings).
        with open(filename, "wb") as f:
            np.savez(f,
                     real=toRect(self._norm_real),
                     simulated=toRect(self._norm_simulated),
                     groundtruth=toRect(self._norm_groundtruth),
                     noise=toRect(self._norm_noise),
                     diff=toRect(self._norm_diff))