def require_peaks_below_mz(spectrum_in: SpectrumType,
                           n_required: int = 10,
                           max_mz: float = 1000.0) -> SpectrumType:
    """Discard spectra that have too few peaks in the low m/z range.

    Parameters
    ----------
    spectrum_in:
        Input spectrum. ``None`` is passed through unchanged.
    n_required:
        Minimum number of counted peaks a spectrum needs in order to be kept.
    max_mz:
        Upper m/z bound for counting. Only peaks with an m/z strictly below
        this value are counted.

    Returns
    -------
    A clone of the input spectrum, or ``None`` when fewer than ``n_required``
    peaks lie below ``max_mz``.
    """
    if spectrum_in is None:
        return None

    spectrum = spectrum_in.clone()

    # Boolean-mask the m/z array and count what is left.
    mz_values = spectrum.peaks.mz
    too_few = mz_values[mz_values < max_mz].size < n_required
    return None if too_few else spectrum
def post_process_normal(spectrum_in: SpectrumType, min_peaks: int = 10) \
        -> Union[SpectrumType, None]:
    """Normal processing of spectra for Spec2Vec.

    Steps, in order: normalize intensities, reject spectra whose normalized
    intensities contain NaN, restrict peaks to m/z 0-1000, require at least
    ``min_peaks`` peaks, reduce the number of peaks (``ratio_desired=0.5``),
    drop peaks below 0.001 relative intensity when enough peaks remain, and
    finally add losses between 5.0 and 200.0.

    Parameters
    ----------
    spectrum_in:
        Input spectrum. ``None`` is passed through unchanged.
    min_peaks:
        Minimum number of peaks to pass the spectrum (otherwise -> None).

    Returns
    -------
    The processed spectrum, or ``None`` if the spectrum was rejected by one
    of the steps.
    """
    if spectrum_in is None:
        return None

    s = normalize_intensities(spectrum_in.clone())
    # NaN intensities after normalization mark spectra whose intensities
    # were all 0; those are removed. Intensities are read by attribute name
    # (as normalize_intensities does) rather than by position in the peaks
    # container.
    if np.isnan(s.peaks.intensities).any():
        return None

    s = select_by_mz(s, mz_from=0, mz_to=1000)
    s = require_minimum_number_of_peaks(s, n_required=min_peaks)
    s = reduce_to_number_of_peaks(s, n_required=min_peaks, ratio_desired=0.5)
    if s is None:
        return None

    # Remove low peaks, but only when enough peaks are left afterwards. The
    # floor is the historical value of 10, raised to min_peaks when that is
    # larger, so this step can never push the spectrum below the requested
    # minimum. For min_peaks <= 10 the behaviour is unchanged.
    peak_floor = max(min_peaks, 10)
    s_remove_low_peaks = select_by_relative_intensity(s, intensity_from=0.001)
    if len(s_remove_low_peaks.peaks) >= peak_floor:
        s = s_remove_low_peaks

    # add losses to normally processed spectra
    return add_losses(s, loss_mz_from=5.0, loss_mz_to=200.0)
def remove_precursor_mz_peak(spectrum_in: SpectrumType) -> SpectrumType:
    """Drop the peak whose m/z equals the spectrum's precursor_mz, if any.

    Parameters
    ----------
    spectrum_in:
        Input spectrum. ``None`` is passed through unchanged.

    Returns
    -------
    A clone of the input spectrum. When the ``precursor_mz`` metadata entry
    is set and a peak with exactly that m/z exists, that peak is left out of
    the clone's peaks.

    Raises
    ------
    AssertionError
        If peaks were removed but the peak count did not drop by exactly one
        (for instance when several peaks share the precursor m/z).
    """
    if spectrum_in is None:
        return None

    spectrum = spectrum_in.clone()
    prec_mz = spectrum.get("precursor_mz")
    n_before = len(spectrum.peaks)

    # Nothing to do when no (truthy) precursor m/z is recorded.
    if not prec_mz:
        return spectrum

    mzs, intensities = spectrum.peaks.clone()
    # Positions of peaks that match the precursor m/z exactly.
    hit_positions = [pos for pos, mz in enumerate(mzs) if mz == prec_mz]
    if not hit_positions:
        return spectrum

    spectrum.peaks = Spikes(mz=np.delete(mzs, hit_positions),
                            intensities=np.delete(intensities, hit_positions))
    n_after = len(spectrum.peaks)
    assert n_after == n_before - 1, \
        "Expected only one peak to have been removed"
    return spectrum
def normalize_intensities(spectrum_in: SpectrumType) -> SpectrumType:
    """Scale peak and loss intensities so the tallest peak has intensity 1.

    Both peaks and losses are divided by the maximum *peak* intensity. If
    that maximum is 0 the division is still carried out, so the resulting
    intensities are not finite; the post-processing functions in this file
    check for NaN after calling this function.

    NOTE(review): this file defines ``normalize_intensities`` a second time
    further down (without loss handling); at import time the later
    definition replaces this one - confirm which is intended.

    Parameters
    ----------
    spectrum_in:
        Input spectrum. ``None`` is passed through unchanged.

    Returns
    -------
    A clone of the input spectrum with rescaled intensities. A spectrum
    without peaks is cloned and returned as is.
    """
    if spectrum_in is None:
        return None

    spectrum = spectrum_in.clone()
    if len(spectrum.peaks) == 0:
        return spectrum

    scale = numpy.max(spectrum.peaks.intensities)

    # Rescale the peaks.
    peak_mz, peak_intensities = spectrum.peaks
    spectrum.peaks = Spikes(mz=peak_mz, intensities=peak_intensities / scale)

    # Rescale the losses with the same factor, when there are any.
    losses = spectrum.losses
    if losses is not None and len(losses) > 0:
        loss_mz, loss_intensities = losses
        spectrum.losses = Spikes(mz=loss_mz,
                                 intensities=loss_intensities / scale)

    return spectrum
def normalize_intensities(spectrum_in: SpectrumType) -> SpectrumType:
    """Scale peak intensities so the tallest peak has intensity 1.

    If the maximum intensity is 0 the division is still carried out, so the
    resulting intensities are not finite; the post-processing functions in
    this file check for NaN after calling this function.

    NOTE(review): an earlier definition of the same name in this file (which
    also rescales losses) is shadowed by this one - confirm which is
    intended.

    Parameters
    ----------
    spectrum_in:
        Input spectrum. ``None`` is passed through unchanged.

    Returns
    -------
    A clone of the input spectrum with rescaled peak intensities. A spectrum
    without peaks is cloned and returned as is.
    """
    if spectrum_in is None:
        return None

    spectrum = spectrum_in.clone()
    if len(spectrum.peaks) == 0:
        return spectrum

    tallest = numpy.max(spectrum.peaks.intensities)
    peak_mz, peak_intensities = spectrum.peaks
    spectrum.peaks = Spikes(mz=peak_mz, intensities=peak_intensities / tallest)
    return spectrum
def post_process_md(spectrum_in: SpectrumType,
                    low_int_cutoff: float = 0.05,
                    min_peaks: int = 10,
                    max_peaks: int = 30) -> Union[SpectrumType, None]:
    """Processing of spectra that are used for mass difference extraction.

    Steps, in order: remove the precursor peak, normalize intensities, reject
    spectra whose normalized intensities contain NaN, restrict peaks to m/z
    0-1000, require at least ``min_peaks`` peaks, reduce the number of peaks
    (``ratio_desired=0.5``), apply two relative-intensity filters (0.001 and
    ``low_int_cutoff``) each only when enough peaks remain, and finally keep
    at most ``max_peaks`` peaks.

    Parameters
    ----------
    spectrum_in:
        Input spectrum. ``None`` is passed through unchanged.
    low_int_cutoff:
        Lower intensity cutoff for the peaks selected for MD.
    min_peaks:
        Minimum number of peaks to pass the spectrum (otherwise -> None).
    max_peaks:
        Maximum number of peaks allowed in the spectrum (ranked on
        intensity).

    Returns
    -------
    The processed spectrum, or ``None`` if the spectrum was rejected by one
    of the steps.
    """
    if spectrum_in is None:
        return None

    # remove precursor_mz from spectra so neutral losses don't end up in MDs
    s = remove_precursor_mz_peak(spectrum_in.clone())
    s = normalize_intensities(s)
    # NaN intensities after normalization mark spectra whose intensities
    # were all 0; those are removed. Intensities are read by attribute name
    # (as normalize_intensities does) rather than by position in the peaks
    # container.
    if np.isnan(s.peaks.intensities).any():
        return None

    s = select_by_mz(s, mz_from=0, mz_to=1000)
    s = require_minimum_number_of_peaks(s, n_required=min_peaks)
    s = reduce_to_number_of_peaks(s, n_required=min_peaks, ratio_desired=0.5)
    if s is None:
        return None

    # Both intensity filters below are applied only when enough peaks are
    # left afterwards. The floor is the historical value of 10, raised to
    # min_peaks when that is larger; otherwise a filter could leave fewer
    # than min_peaks peaks and the final reduce_to_number_of_peaks call
    # (n_required=min_peaks) would presumably reject a spectrum that had
    # already passed the minimum. For min_peaks <= 10 nothing changes.
    peak_floor = max(min_peaks, 10)

    # remove low peaks unless too few peaks would be left
    s_remove_low_peaks = select_by_relative_intensity(s, intensity_from=0.001)
    if len(s_remove_low_peaks.peaks) >= peak_floor:
        s = s_remove_low_peaks

    # do an additional removal step with a different intensity cutoff
    s_second_peak_removal = select_by_relative_intensity(
        s, intensity_from=low_int_cutoff)
    if len(s_second_peak_removal.peaks) >= peak_floor:
        s = s_second_peak_removal

    # reduce to the max_peaks most intense peaks
    return reduce_to_number_of_peaks(s, n_required=min_peaks, n_max=max_peaks)
def post_process_classical(spectrum_in: SpectrumType, min_peaks: int = 10) \
        -> Union[SpectrumType, None]:
    """Processing of spectra for calculating classical scores.

    Steps, in order: normalize intensities, reject spectra whose normalized
    intensities contain NaN, restrict peaks to m/z 0-1000, require at least
    ``min_peaks`` peaks, and keep peaks with relative intensity between 0.01
    and 1.0.

    Parameters
    ----------
    spectrum_in:
        Input spectrum. ``None`` is passed through unchanged.
    min_peaks:
        Minimum number of peaks to pass the spectrum (otherwise -> None).

    Returns
    -------
    The result of the final filter step, or ``None`` if the input was
    ``None`` or the normalized intensities contained NaN.
    """
    if spectrum_in is None:
        return None

    s = normalize_intensities(spectrum_in.clone())
    # NaN intensities after normalization mark spectra whose intensities
    # were all 0; those are removed. Intensities are read by attribute name
    # (as normalize_intensities does) rather than by position in the peaks
    # container.
    if np.isnan(s.peaks.intensities).any():
        return None

    s = select_by_mz(s, mz_from=0, mz_to=1000)
    s = require_minimum_number_of_peaks(s, n_required=min_peaks)
    return select_by_relative_intensity(s, intensity_from=0.01,
                                        intensity_to=1.0)
def get_mass_differences(spectrum_in: SpectrumType, multiply: bool = False,
                         max_mds_per_peak: int = 30, cutoff: int = 36,
                         n_max: int = 100) -> Union[Spikes, None]:
    """Return Spikes with the most intense mass differences (MDs).

    Peaks are ranked by decreasing intensity and every peak is paired with
    the peaks that follow it in that ranking, limited by a per-peak budget.
    For each pair the absolute m/z difference is computed; differences
    larger than ``cutoff`` are kept and get an intensity derived from the two
    parent peaks. Finally the ``n_max`` most intense differences are
    selected and returned ordered by m/z.

    Parameters
    ----------
    spectrum_in:
        Spectrum in matchms.Spectrum format. ``None`` is passed through.
    multiply:
        Multiply parent peak intensities instead of taking the mean.
    max_mds_per_peak:
        Maximum amount of MDs that can originate from one peak, ranked on
        intensity. The original author notes the minimum is 2 with this
        implementation.
    cutoff:
        Mass cutoff for mass difference (default like Xing et al.). Only
        differences strictly larger than this value are kept.
    n_max:
        Maximum amount of mass differences to select, ranked on intensity
        (default like Xing et al.). Values <= 0 select nothing.

    Returns
    -------
    Spikes holding the selected mass differences as ``mz`` and their
    intensities, sorted by m/z, or ``None`` if ``spectrum_in`` is ``None``.
    """
    if spectrum_in is None:
        return None

    spectrum = spectrum_in.clone()
    peaks_mz_ori, peaks_intensities_ori = spectrum.peaks

    # Rank peaks from most to least intense so that the per-peak budget is
    # spent on the strongest partners first.
    sort_idx = peaks_intensities_ori.argsort()[::-1]
    peaks_intensities = peaks_intensities_ori[sort_idx]
    peaks_mz = peaks_mz_ori[sort_idx]

    mass_diff_mz = []
    mass_diff_intensities = []
    # Number of pairings each m/z value has taken part in so far.
    used_mz_dict = {mz_val: 0 for mz_val in peaks_mz}
    for i, (mz_i, int_i) in enumerate(zip(peaks_mz[:-1],
                                          peaks_intensities[:-1])):
        # Remaining budget for this peak. Clamp at 0: a negative value could
        # make the slice end negative, which would wrap around to the end of
        # the array instead of selecting nothing.
        allowed = max(max_mds_per_peak - used_mz_dict[mz_i], 0)
        partners = slice(i + 1, i + 1 + allowed)
        for mz_j, int_j in zip(peaks_mz[partners],
                               peaks_intensities[partners]):
            used_mz_dict[mz_i] += 1
            used_mz_dict[mz_j] += 1
            # Peaks are ordered by intensity, not by m/z, so the raw
            # difference can have either sign; the mass difference is its
            # magnitude.
            mz_diff = abs(mz_j - mz_i)
            if mz_diff > cutoff:
                mass_diff_mz.append(mz_diff)
                if multiply:
                    mass_diff_intensities.append(int_i * int_j)
                else:
                    mass_diff_intensities.append((int_i + int_j) / 2)

    mass_diff_mz = np.array(mass_diff_mz)
    mass_diff_intensities = np.array(mass_diff_intensities)

    # Keep the n_max most intense differences. An explicit start index is
    # used because a plain [-n_max:] slice would return everything for
    # n_max == 0.
    by_intensity = mass_diff_intensities.argsort()
    idx = by_intensity[max(by_intensity.size - n_max, 0):]

    # Order the selection by m/z.
    idx_sort_by_mz = mass_diff_mz[idx].argsort()
    return Spikes(mz=mass_diff_mz[idx][idx_sort_by_mz],
                  intensities=mass_diff_intensities[idx][idx_sort_by_mz])