Example #1
    def __init__(self,
                 filename,
                 min_mz=0,
                 max_mz=np.inf,
                 min_int=0,
                 tmpdir=None,
                 load=False):
        """
        :param filename: 
        :param min_mz: 
        :param max_mz: 
        :param min_int: 
        """
        super(DiskIndexedDataset, self).__init__(filename)
        self.outOfMemoryDataset = imsDataset(filename)
        self.coordinates = self.outOfMemoryDataset.coordinates
        step_size = self.outOfMemoryDataset.step_size
        cube = ion_datacube(step_size=step_size)
        cube.add_coords(self.coordinates)
        self.cube_pixel_indices = cube.pixel_indices
        self.cube_n_row, self.cube_n_col = cube.nRows, cube.nColumns
        self._set_tmpdir(tmpdir)
        if load and os.path.exists(self._tmp_file_name) and os.path.exists(
                self._index_file_name):
            self._read_index()
        else:
            print('creating optimised on-disk structure')
            self.dump_sorted_by_mzs(min_mz, max_mz, min_int, load=load)
        self.mz_list = DiskList(self.index, self._tmp_file_name, 'mzs')
        self.count_list = DiskList(self.index, self._tmp_file_name, 'ints')
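For context, a minimal usage sketch of the constructor above. The file path is hypothetical, and the DiskList attributes are assumed to behave like lazily read sequences backed by the temporary file:

# Minimal usage sketch (hypothetical path; assumes the imports the
# class relies on: numpy as np, imsDataset, ion_datacube, DiskList).
ds = DiskIndexedDataset('example.imzML',
                        min_mz=100.,
                        max_mz=1000.,
                        min_int=10.,
                        load=True)  # reuse the on-disk index if it exists
print(len(ds.coordinates))  # one entry per spectrum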
Example #2
 def load_file(self,
               outOfMemoryDataset,
               min_mz,
               max_mz,
               min_int,
               index_range=None,
               spectrum_type='centroids'):
     # parse file to get required parameters
     # can use thin hdf5 wrapper for getting data from file
     self.file_dir, self.filename = os.path.split(
         outOfMemoryDataset.filename)
     self.filename, self.file_type = os.path.splitext(self.filename)
     self.coordinates = outOfMemoryDataset.coordinates
     step_size = outOfMemoryDataset.step_size
     cube = ion_datacube(step_size=step_size)
     cube.add_coords(self.coordinates)
     self.cube_pixel_indices = cube.pixel_indices
     self.cube_n_row, self.cube_n_col = cube.nRows, cube.nColumns
     self.spectrum_type = spectrum_type  # fixme this should be read from the base file during get_spectrum?
     # load data into memory
     self.mz_list = []
     self.count_list = []
     self.idx_list = []
     for ii in range(len(self.coordinates)):
          # load spectrum; keep only values above zero (they shouldn't be present anyway)
         mzs, counts = outOfMemoryDataset.get_spectrum(ii)
         if len(mzs) != len(counts):
             raise TypeError(
                 'length of mzs ({}) not equal to counts ({})'.format(
                     len(mzs), len(counts)))
         # Enforce data limits
         valid = np.where((mzs > min_mz) & (mzs < max_mz)
                          & (counts > min_int))
         counts = counts[valid]
         mzs = mzs[valid]
          # append to ever-growing lists (should probably be preallocated or piped to disk and re-loaded)
         self.mz_list.append(mzs)
         self.count_list.append(counts)
         self.idx_list.append(np.ones(len(mzs), dtype=int) * ii)
     print('loaded spectra')
     self.mz_list = np.concatenate(self.mz_list)
     self.count_list = np.concatenate(self.count_list)
     self.idx_list = np.concatenate(self.idx_list)
     # sort by mz for fast image formation
     mz_order = np.argsort(self.mz_list)
     self.mz_list = self.mz_list[mz_order]
     self.count_list = self.count_list[mz_order]
     self.idx_list = self.idx_list[mz_order]
     self.mz_min = self.mz_list[0]
     self.mz_max = self.mz_list[-1]
     # split binary searches into two stages for better locality
     self.window_size = 1024
     self.mz_sublist = self.mz_list[::self.window_size].copy()
     print('file loaded')
     self.outOfMemoryDataset = outOfMemoryDataset
     self.nSpectra = len(self.coordinates)
     self.tic = np.bincount(self.idx_list,
                            weights=self.count_list,
                            minlength=self.nSpectra)
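The mz_sublist built above caches every 1024th m/z value, so a range query can run as two binary searches: a coarse one over the small sublist, then a fine one inside the matching window of the full array. The sketch below illustrates that idea; find_mz_bounds is a hypothetical helper, not part of the class:

import numpy as np

def find_mz_bounds(mz_list, mz_sublist, window_size, mz_lo, mz_hi):
    # Stage 1: coarse binary search over the strided sublist
    # (one element per window_size entries of mz_list).
    lo_win = np.searchsorted(mz_sublist, mz_lo, side='left')
    hi_win = np.searchsorted(mz_sublist, mz_hi, side='right')
    # Stage 2: fine binary search restricted to the candidate
    # windows, which touches a much smaller region of memory.
    start = max(lo_win - 1, 0) * window_size
    stop = min(hi_win * window_size + 1, len(mz_list))
    lo = start + np.searchsorted(mz_list[start:stop], mz_lo, side='left')
    hi = start + np.searchsorted(mz_list[start:stop], mz_hi, side='right')
    return lo, hi  # mz_list[lo:hi] lies within [mz_lo, mz_hi]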
Example #3
 def empty_datacube(self):
     data_out = ion_datacube()
     # add precomputed pixel indices
     data_out.coords = self.coords
     data_out.pixel_indices = self.cube_pixel_indices
     data_out.nRows = self.cube_n_row
     data_out.nColumns = self.cube_n_col
     return data_out
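Downstream code can then scatter one value per spectrum into the returned cube. A sketch of that step follows, assuming pixel_indices holds the flat row-major pixel index of each spectrum as computed by ion_datacube.add_coords; vector_to_image is a hypothetical helper:

import numpy as np

def vector_to_image(values, pixel_indices, n_rows, n_cols):
    # One value per spectrum, scattered to its pixel, then reshaped.
    im = np.zeros(n_rows * n_cols)
    im[pixel_indices] = values
    return im.reshape((n_rows, n_cols))

# e.g. turn the per-spectrum TIC from Example #2 into an image:
# tic_image = vector_to_image(self.tic, self.cube_pixel_indices,
#                             self.cube_n_row, self.cube_n_col)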
Example #4
    def load_file(self,
                  filename,
                  min_mz=0,
                  max_mz=np.inf,
                  min_int=0,
                  index_range=None,
                  cache_spectra=True,
                  do_summary=True,
                  norm='',
                  norm_args=None,
                  spectrum_type='centroids'):
        # parse file to get required parameters
        # can use thin hdf5 wrapper for getting data from file
        self.file_dir, self.filename = os.path.split(filename)
        self.filename, self.file_type = os.path.splitext(self.filename)
        self.file_type = self.file_type.lower()
        self.norm = norm.lower()
        self.norm_args = norm_args if norm_args is not None else {}
        if self.file_type == '.hdf5':
            import h5py
            self.hdf = h5py.File(filename, 'r')  # read-only; the file must exist
            if not index_range:
                self.index_list = list(map(int, self.hdf['/spectral_data'].keys()))
            else:
                self.index_list = index_range
        elif self.file_type == '.imzml':
            from pyimzml.ImzMLParser import ImzMLParser
            self.imzml = ImzMLParser(filename)
            self.index_list = range(0, len(self.imzml.coordinates))
        else:
            raise TypeError('File type not recognised: {}'.format(
                self.file_type))
        self.max_index = max(self.index_list)
        self.coords = self.get_coords()
        step_size = self.get_step_size()
        cube = ion_datacube(step_size=step_size)
        cube.add_coords(self.coords)
        self.cube_pixel_indices = cube.pixel_indices
        self.cube_n_row, self.cube_n_col = cube.nRows, cube.nColumns
        self.histogram_mz_axis = {}
        self.mz_min = np.inf
        self.mz_max = 0.
        self.spectrum_type = spectrum_type  # todo: this should be read from imzML files, not passed as an input
        if cache_spectra or do_summary:
            # load data into memory
            self.mz_list = []
            self.count_list = []
            self.idx_list = []
            if do_summary:
                # spectrum ids index these arrays, so size them by the largest id
                self.mic = np.zeros((self.max_index + 1, 1))
                self.tic = np.zeros((self.max_index + 1, 1))
            for ii in self.index_list:
                # load spectrum; keep only values above zero (they shouldn't be present anyway)
                this_spectrum = self.get_spectrum(ii)
                mzs, counts = this_spectrum.get_spectrum(source=spectrum_type)
                if len(mzs) != len(counts):
                    raise TypeError(
                        'length of mzs ({}) not equal to counts ({})'.format(
                            len(mzs), len(counts)))
                # Enforce data limits
                valid = np.where((mzs > min_mz) & (mzs < max_mz)
                                 & (counts > min_int))
                counts = counts[valid]
                mzs = mzs[valid]
                # record min/max

                if len(mzs) > 0:
                    if mzs[0] < self.mz_min:
                        self.mz_min = mzs[0]
                    if mzs[-1] > self.mz_max:
                        self.mz_max = mzs[-1]
                    # record summary values
                    if do_summary:
                        self.tic[ii] = sum(counts)
                        self.mic[ii] = max(counts)
                # append to ever-growing lists (should probably be preallocated or piped to disk and re-loaded)
                if cache_spectra:
                    self.mz_list.append(mzs)
                    self.count_list.append(counts)
                    self.idx_list.append(np.ones(len(mzs), dtype=int) * ii)

            print('loaded spectra')
            if cache_spectra:
                self.mz_list = np.concatenate(self.mz_list)
                self.count_list = np.concatenate(self.count_list)
                self.idx_list = np.concatenate(self.idx_list)
                # sort by mz for fast image formation
                mz_order = np.argsort(self.mz_list)
                self.mz_list = self.mz_list[mz_order]
                self.count_list = self.count_list[mz_order]
                self.idx_list = self.idx_list[mz_order]
                # split binary searches into two stages for better locality
                self.window_size = 1024
                self.mz_sublist = self.mz_list[::self.window_size].copy()
        print('file loaded')
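Sorting by m/z is what makes ion-image formation cheap: a narrow m/z window becomes one contiguous slice of the cached arrays. The sketch below shows that lookup under the attribute names set in load_file; ion_counts_per_spectrum is a hypothetical method, and the ppm tolerance convention is an assumption:

def ion_counts_per_spectrum(self, mz, ppm):
    # Hypothetical method: sum the counts within +/- ppm of mz for
    # every spectrum, using the m/z-sorted arrays cached by load_file
    # (assumes numpy imported as np, as in the snippets above).
    delta = mz * ppm * 1e-6
    lo = np.searchsorted(self.mz_list, mz - delta, side='left')
    hi = np.searchsorted(self.mz_list, mz + delta, side='right')
    return np.bincount(self.idx_list[lo:hi],
                       weights=self.count_list[lo:hi],
                       minlength=self.max_index + 1)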