def test_select_by_relative_intensity_with_to_parameter_too_large():
    """Passing intensity_to=10.0 is expected to raise an AssertionError."""
    peak_mz = numpy.array([10, 20, 30, 40], dtype="float")
    peak_intensities = numpy.array([1, 10, 100, 1000], dtype="float")
    spectrum_in = Spectrum(mz=peak_mz, intensities=peak_intensities)

    with pytest.raises(AssertionError):
        select_by_relative_intensity(spectrum_in, intensity_to=10.0)
def post_process_normal(
        spectrum_in: SpectrumType,
        min_peaks: int = 10) -> Union[SpectrumType, None]:
    """Run the standard Spec2Vec processing pipeline on one spectrum.

    The input is cloned first, so the caller's spectrum is not reassigned
    by this function.

    Parameters
    ----------
    spectrum_in:
        Input spectrum. ``None`` is passed through as ``None``.
    min_peaks:
        Minimum number of peaks required for the spectrum to pass
        (otherwise the result is ``None``).

    Returns
    -------
    The processed spectrum with losses added, or ``None`` when the spectrum
    is rejected by one of the steps.
    """
    if spectrum_in is None:
        return None

    spectrum = normalize_intensities(spectrum_in.clone())

    # NaN intensities after normalization mark a spectrum to discard
    # (original note: spectra that have all intensities 0).
    if any(np.isnan(spectrum.peaks[1])):
        return None

    spectrum = select_by_mz(spectrum, mz_from=0, mz_to=1000)
    spectrum = require_minimum_number_of_peaks(spectrum, n_required=min_peaks)
    spectrum = reduce_to_number_of_peaks(
        spectrum, n_required=min_peaks, ratio_desired=0.5)
    if spectrum is None:
        return None

    # Drop low-intensity peaks, but only if at least 10 peaks survive.
    # The threshold is the literal 10, independent of min_peaks.
    without_low_peaks = select_by_relative_intensity(
        spectrum, intensity_from=0.001)
    spectrum = (without_low_peaks
                if len(without_low_peaks.peaks) >= 10
                else spectrum)

    # Losses are added to normally processed spectra.
    return add_losses(spectrum, loss_mz_from=5.0, loss_mz_to=200.0)
def test_select_by_relative_intensity(
        spectrum_in,
        intensity_from,
        intensity_to,
        expected_mz,
        expected_intensities):
    """Check that filtering by relative intensity keeps the expected peaks.

    The arguments are supplied from outside this function (presumably by a
    pytest parametrization/fixture that is not visible here).
    """
    filtered = select_by_relative_intensity(
        spectrum_in,
        intensity_from=intensity_from,
        intensity_to=intensity_to,
    )

    # Peak count matches expectation and mz/intensities stay aligned.
    assert filtered.peaks.mz.size == len(expected_mz)
    assert filtered.peaks.mz.size == filtered.peaks.intensities.size

    # Peak values match expectation exactly.
    assert numpy.array_equal(filtered.peaks.mz, expected_mz)
    assert numpy.array_equal(filtered.peaks.intensities, expected_intensities)
def apply_my_filters(s):
    """Apply a fixed chain of filters to spectrum ``s`` and return the result.

    The steps run in order: default filters, parent-mass addition, intensity
    normalization, relative-intensity selection (0.0 to 1.0), m/z selection
    (0 to 1000), and a minimum-peak requirement of 5.
    """
    pipeline = (
        default_filters,
        add_parent_mass,
        normalize_intensities,
        lambda spec: select_by_relative_intensity(
            spec, intensity_from=0.0, intensity_to=1.0),
        lambda spec: select_by_mz(spec, mz_from=0, mz_to=1000),
        lambda spec: require_minimum_number_of_peaks(spec, n_required=5),
    )
    for step in pipeline:
        s = step(s)
    return s
def test_select_by_relative_intensity_with_empty_peaks():
    """Spectrums with empty peak arrays can show up in some workflows;
    the filter must not break on them and should return them unchanged."""
    # A builder with no configuration is used as the empty-peaks input
    # (presumably it yields empty peak arrays - confirm against SpectrumBuilder).
    empty_spectrum = SpectrumBuilder().build()

    result = select_by_relative_intensity(
        empty_spectrum, intensity_from=0.01, intensity_to=0.99)

    assert result == empty_spectrum, "Spectrum should remain unchanged."
def post_process_md(
        spectrum_in: SpectrumType,
        low_int_cutoff: float = 0.05,
        min_peaks: int = 10,
        max_peaks: int = 30) -> Union[SpectrumType, None]:
    """Process a spectrum that will be used for mass difference (MD) extraction.

    The input is cloned first, so the caller's spectrum is not reassigned
    by this function.

    Parameters
    ----------
    spectrum_in:
        Input spectrum. ``None`` is passed through as ``None``.
    low_int_cutoff:
        Lower relative-intensity cutoff for the peaks selected for MD.
    min_peaks:
        Minimum number of peaks required for the spectrum to pass
        (otherwise the result is ``None``).
    max_peaks:
        Maximum number of peaks allowed in the spectrum (ranked on intensity).

    Returns
    -------
    The processed spectrum, or ``None`` when the spectrum is rejected.
    """
    if spectrum_in is None:
        return None

    # Remove the precursor_mz peak so neutral losses don't end up in the MDs.
    spectrum = remove_precursor_mz_peak(spectrum_in.clone())
    spectrum = normalize_intensities(spectrum)

    # NaN intensities after normalization mark a spectrum to discard
    # (original note: spectra that have all intensities 0).
    if any(np.isnan(spectrum.peaks[1])):
        return None

    spectrum = select_by_mz(spectrum, mz_from=0, mz_to=1000)
    spectrum = require_minimum_number_of_peaks(spectrum, n_required=min_peaks)
    spectrum = reduce_to_number_of_peaks(
        spectrum, n_required=min_peaks, ratio_desired=0.5)
    if spectrum is None:
        return None

    # Two successive low-intensity removals; each one is only accepted when
    # at least 10 peaks remain (literal 10, independent of min_peaks).
    without_low_peaks = select_by_relative_intensity(
        spectrum, intensity_from=0.001)
    spectrum = (without_low_peaks
                if len(without_low_peaks.peaks) >= 10
                else spectrum)

    above_cutoff = select_by_relative_intensity(
        spectrum, intensity_from=low_int_cutoff)
    spectrum = (above_cutoff
                if len(above_cutoff.peaks) >= 10
                else spectrum)

    # Finally cap the spectrum at max_peaks peaks (default: top 30).
    return reduce_to_number_of_peaks(
        spectrum, n_required=min_peaks, n_max=max_peaks)
def test_select_by_relative_intensity_with_empty_peaks():
    """Spectrums with empty peak arrays can show up in some workflows;
    the filter must not break on them and should return them unchanged."""
    no_mz = numpy.array([], dtype="float")
    no_intensities = numpy.array([], dtype="float")
    empty_spectrum = Spectrum(mz=no_mz, intensities=no_intensities)

    result = select_by_relative_intensity(
        empty_spectrum, intensity_from=0.01, intensity_to=0.99)

    assert result == empty_spectrum, "Spectrum should remain unchanged."
def test_select_by_relative_intensity_with_from_and_to_parameters():
    """With bounds 0.01 and 0.99 only the two middle peaks are expected to remain."""
    spectrum_in = Spectrum(
        mz=numpy.array([10, 20, 30, 40], dtype="float"),
        intensities=numpy.array([1, 10, 100, 1000], dtype="float"),
    )
    expected_mz = numpy.array([20, 30], dtype="float")
    expected_intensities = numpy.array([10, 100], dtype="float")

    filtered = select_by_relative_intensity(
        spectrum_in, intensity_from=0.01, intensity_to=0.99)

    assert filtered.peaks.mz.size == 2
    assert filtered.peaks.mz.size == filtered.peaks.intensities.size
    assert numpy.array_equal(filtered.peaks.mz, expected_mz)
    assert numpy.array_equal(filtered.peaks.intensities, expected_intensities)
def test_select_by_relative_intensity_no_parameters():
    """Called without explicit bounds, the filter is expected to keep all four peaks."""
    spectrum_in = Spectrum(
        mz=numpy.array([10, 20, 30, 40], dtype="float"),
        intensities=numpy.array([1, 10, 100, 1000], dtype="float"),
    )
    expected_mz = numpy.array([10, 20, 30, 40], dtype="float")
    expected_intensities = numpy.array([1, 10, 100, 1000], dtype="float")

    filtered = select_by_relative_intensity(spectrum_in)

    assert filtered.peaks.mz.size == 4
    assert filtered.peaks.mz.size == filtered.peaks.intensities.size
    assert numpy.array_equal(filtered.peaks.mz, expected_mz)
    assert numpy.array_equal(filtered.peaks.intensities, expected_intensities)
def post_process(s):
    """Normalize, filter and annotate spectrum ``s`` with losses.

    Steps, in order: intensity normalization, m/z selection (0 to 1000),
    a minimum-peak requirement of 10, a best-effort peak reduction,
    optional removal of low-intensity peaks, and loss addition.

    Parameters
    ----------
    s:
        Input spectrum.

    Returns
    -------
    The processed spectrum, or ``None`` if ``s`` is ``None`` after the
    minimum-peak / reduction steps.
    """
    s = normalize_intensities(s)
    s = select_by_mz(s, mz_from=0, mz_to=1000)
    s = require_minimum_number_of_peaks(s, n_required=10)

    try:
        s = reduce_to_number_of_peaks(s, n_required=10, ratio_desired=0.5)
    except Exception:
        # Deliberate best-effort: if the reduction fails, continue with the
        # unreduced spectrum. Only Exception is caught, so KeyboardInterrupt
        # and SystemExit are no longer swallowed as they were by the
        # previous bare ``except``.
        pass

    if s is None:
        return None

    # Drop low-intensity peaks, but only if at least 10 peaks survive.
    s_remove_low_peaks = select_by_relative_intensity(s, intensity_from=0.001)
    if len(s_remove_low_peaks.peaks) >= 10:
        s = s_remove_low_peaks

    s = add_losses(s, loss_mz_from=5.0, loss_mz_to=200.0)
    return s
def require_minimum_of_high_peaks(spectrum_in, no_peaks=5, intensity_percent=2.0):
    """Return a clone of the spectrum only if it has enough high-intensity peaks.

    Parameters
    ----------
    spectrum_in:
        Input spectrum. ``None`` is passed through as ``None``.
    no_peaks:
        Minimum number of peaks that must lie at or above the intensity
        threshold. Must be >= 1.
    intensity_percent:
        Relative intensity threshold as a percentage (0-100); it is divided
        by 100 before being handed to ``select_by_relative_intensity``.

    Returns
    -------
    A clone of ``spectrum_in``, or ``None`` when fewer than ``no_peaks``
    peaks pass the threshold.

    Raises
    ------
    AssertionError
        If ``no_peaks`` or ``intensity_percent`` is out of range.
    """
    if spectrum_in is None:
        return None

    spectrum = spectrum_in.clone()

    assert no_peaks >= 1, "no_peaks must be a positive nonzero integer."
    assert 0 <= intensity_percent <= 100, "intensity_percent must be a scalar between 0-100."

    # The selection is used only for counting; the returned spectrum keeps
    # whatever the clone contains.
    relative_threshold = intensity_percent / 100
    high_peaks = select_by_relative_intensity(
        spectrum, intensity_from=relative_threshold, intensity_to=1.0).peaks

    return None if len(high_peaks) < no_peaks else spectrum
def post_process_classical(
        spectrum_in: SpectrumType,
        min_peaks: int = 10) -> Union[SpectrumType, None]:
    """Process a spectrum for the calculation of classical scores.

    The input is cloned first, so the caller's spectrum is not reassigned
    by this function.

    Parameters
    ----------
    spectrum_in:
        Input spectrum. ``None`` is passed through as ``None``.
    min_peaks:
        Minimum number of peaks required for the spectrum to pass
        (otherwise the result is ``None``).

    Returns
    -------
    The processed spectrum, or ``None`` when the spectrum is rejected.
    """
    if spectrum_in is None:
        return None

    spectrum = normalize_intensities(spectrum_in.clone())

    # NaN intensities after normalization mark a spectrum to discard
    # (original note: spectra that have all intensities 0).
    if any(np.isnan(spectrum.peaks[1])):
        return None

    spectrum = select_by_mz(spectrum, mz_from=0, mz_to=1000)
    spectrum = require_minimum_number_of_peaks(spectrum, n_required=min_peaks)
    return select_by_relative_intensity(
        spectrum, intensity_from=0.01, intensity_to=1.0)
def apply_filters(s):
    """Normalize ``s``, restrict it by m/z and relative intensity, and clear its losses.

    ``min_mz``, ``max_mz`` and ``intensity_threshold`` are read from the
    enclosing scope; they are not defined inside this function.
    """
    normalized = normalize_intensities(s)
    within_mz_range = select_by_mz(normalized, mz_from=min_mz, mz_to=max_mz)
    filtered = select_by_relative_intensity(
        within_mz_range, intensity_from=intensity_threshold)

    # Losses are explicitly reset on the filtered spectrum.
    filtered.losses = None
    return filtered
def main(argv):
    """Command-line entry point: load spectra, apply the selected filters, save.

    Parameters
    ----------
    argv:
        Accepted for the caller's convenience but not used; arguments are
        parsed from ``sys.argv`` by ``parser.parse_args()``.

    Returns
    -------
    int
        ``0`` on success.

    Raises
    ------
    ValueError
        If no filter flag is given, or if ``--spectra_format`` is neither
        ``msp`` nor ``mgf``.
    """
    parser = argparse.ArgumentParser(
        description="Compute MSP similarity scores")
    parser.add_argument("--spectra", type=str, required=True,
                        help="Mass spectra file to be filtered.")
    parser.add_argument("--spectra_format", type=str, required=True,
                        help="Format of spectra file.")
    parser.add_argument("--output", type=str, required=True,
                        help="Filtered mass spectra file.")
    parser.add_argument(
        "-normalise_intensities", action='store_true',
        help="Normalize intensities of peaks (and losses) to unit height.")
    parser.add_argument(
        "-default_filters", action='store_true',
        help=
        "Collection of filters that are considered default and that do no require any (factory) arguments."
    )
    parser.add_argument(
        "-clean_metadata", action='store_true',
        help=
        "Apply all adding and cleaning filters if possible, so that the spectra have canonical metadata."
    )
    parser.add_argument(
        "-relative_intensity", action='store_true',
        help=
        "Keep only peaks within set relative intensity range (keep if to_intensity >= intensity >= from_intensity)."
    )
    parser.add_argument("--from_intensity", type=float,
                        help="Lower bound for intensity filter")
    parser.add_argument("--to_intensity", type=float,
                        help="Upper bound for intensity filter")
    parser.add_argument(
        "-mz_range", action='store_true',
        help=
        "Keep only peaks between set m/z range (keep if to_mz >= m/z >= from_mz)."
    )
    parser.add_argument("--from_mz", type=float,
                        help="Lower bound for m/z filter")
    parser.add_argument("--to_mz", type=float,
                        help="Upper bound for m/z filter")

    args = parser.parse_args()

    # At least one filter flag has to be selected.
    if not (args.normalise_intensities or args.default_filters
            or args.clean_metadata or args.relative_intensity
            or args.mz_range):
        raise ValueError('No filter selected.')

    # Fix: the second branch used to read ``args.queries_format``, an option
    # this parser never defines, so every non-msp format raised
    # AttributeError and the mgf loader was unreachable.
    if args.spectra_format == 'msp':
        spectra = list(load_from_msp(args.spectra))
    elif args.spectra_format == 'mgf':
        spectra = list(load_from_mgf(args.spectra))
    else:
        raise ValueError(
            f'File format {args.spectra_format} not supported for mass spectra file.'
        )

    filtered_spectra = []
    for spectrum in spectra:
        if args.normalise_intensities:
            spectrum = normalize_intensities(spectrum)

        if args.default_filters:
            spectrum = default_filters(spectrum)

        if args.clean_metadata:
            filters = [
                add_compound_name, add_precursor_mz, add_fingerprint,
                add_losses, add_parent_mass, add_retention_index,
                add_retention_time, clean_compound_name
            ]
            for metadata_filter in filters:
                spectrum = metadata_filter(spectrum)

        # NOTE: the bounds below are None when the matching option is
        # omitted (argparse default); they are passed through unchanged.
        if args.relative_intensity:
            spectrum = select_by_relative_intensity(
                spectrum, args.from_intensity, args.to_intensity)

        if args.mz_range:
            spectrum = select_by_mz(spectrum, args.from_mz, args.to_mz)

        filtered_spectra.append(spectrum)

    # The output is written in the same format as the input.
    if args.spectra_format == 'msp':
        save_as_msp(filtered_spectra, args.output)
    else:
        save_as_mgf(filtered_spectra, args.output)

    return 0
def test_select_by_relative_intensity_with_to_parameter_too_large(spectrum_in: Spectrum):
    """Passing intensity_to=10.0 is expected to raise an AssertionError."""
    oversized_upper_bound = 10.0
    with pytest.raises(AssertionError):
        select_by_relative_intensity(
            spectrum_in, intensity_to=oversized_upper_bound)
def test_select_by_relative_intensity_with_from_parameter_too_small(spectrum_in: Spectrum):
    """Passing intensity_from=-10.0 is expected to raise an AssertionError."""
    negative_lower_bound = -10.0
    with pytest.raises(AssertionError):
        select_by_relative_intensity(
            spectrum_in, intensity_from=negative_lower_bound)