def test_modified_cosine_order_of_input_spectrums():
    """Swapping the two input spectra must give the same score and match count."""
    first = Spectrum(
        mz=numpy.array([100, 150, 200, 300, 500, 510, 1100], dtype="float"),
        intensities=numpy.array([700, 200, 100, 1000, 200, 5, 500], dtype="float"),
        metadata={"precursor_mz": 1000.0},
    )
    second = Spectrum(
        mz=numpy.array([55, 105, 205, 304.5, 494.5, 515.5, 1045], dtype="float"),
        intensities=numpy.array([700, 200, 100, 1000, 200, 5, 500], dtype="float"),
        metadata={"precursor_mz": 1005.0},
    )
    first_normalized = normalize_intensities(first)
    second_normalized = normalize_intensities(second)

    similarity = ModifiedCosine(tolerance=2.0)
    # Score the pair in both argument orders.
    forward_score, forward_matches = similarity.pair(first_normalized, second_normalized)
    backward_score, backward_matches = similarity.pair(second_normalized, first_normalized)

    order_msg = "Expected that the order of the arguments would not matter."
    assert forward_score == backward_score, order_msg
    assert forward_matches == backward_matches, order_msg
def test_modified_cosine_with_mass_shift(peaks, tolerance, masses, expected_matches):
    """Test modified cosine on two spectra with mass shift.

    Arguments are presumably supplied by a ``pytest.mark.parametrize``
    decorator that is not visible in this part of the file.

    Args:
    --------
    peaks:
        ``peaks[0]`` and ``peaks[1]`` each hold ``(mz, intensities)`` for the
        first and second spectrum.
    tolerance:
        Tolerance passed to ``ModifiedCosine``; ``None`` uses its default.
    masses:
        ``masses[0]`` and ``masses[1]`` are the ``precursor_mz`` values of the
        first and second spectrum.
    expected_matches:
        Expected peak matches; forwarded to ``compute_expected_score`` and its
        length is compared with the reported number of matches.
    """
    builder = SpectrumBuilder()
    spectra = []
    for (mz, intensities), precursor_mz in zip(peaks[:2], masses[:2]):
        spectrum = (
            builder.with_mz(mz)
            .with_intensities(intensities)
            .with_metadata(metadata={"precursor_mz": precursor_mz})
            .build()
        )
        spectra.append(spectrum)
    norm_spectrum_1 = normalize_intensities(spectra[0])
    norm_spectrum_2 = normalize_intensities(spectra[1])

    modified_cosine = (ModifiedCosine() if tolerance is None
                       else ModifiedCosine(tolerance=tolerance))
    score = modified_cosine.pair(norm_spectrum_1, norm_spectrum_2)
    expected_score = compute_expected_score(norm_spectrum_1, norm_spectrum_2,
                                            expected_matches)

    assert score["score"] == pytest.approx(
        expected_score, 0.0001), "Expected different cosine score."
    assert score["matches"] == len(
        expected_matches), "Expected different number of matching peaks."
def test_modified_cosine_with_mass_shifted_and_unshifted_matches():
    """Modified cosine where shifted and unshifted peak pairs compete.

    Five peak pairs are possible here, but only three may be selected,
    because every peak can only be counted once.
    """
    spectrum_1 = Spectrum(
        mz=numpy.array([100, 110, 200, 300, 400, 500, 600], dtype="float"),
        intensities=numpy.array([100, 50, 1, 80, 1, 1, 50], dtype="float"),
        metadata={"precursor_mz": 1000.0},
    )
    spectrum_2 = Spectrum(
        mz=numpy.array([110, 200, 300, 310, 700, 800], dtype="float"),
        intensities=numpy.array([100, 1, 90, 90, 1, 100], dtype="float"),
        metadata={"precursor_mz": 1010.0},
    )

    result = ModifiedCosine().pair(spectrum_1, spectrum_2)

    # Recompute the expected score by hand from the three selected pairs.
    intensities_1 = spectrum_1.peaks.intensities
    intensities_2 = spectrum_2.peaks.intensities
    numerator = (intensities_1[0] * intensities_2[0]
                 + intensities_1[3] * intensities_2[3]
                 + intensities_1[2] * intensities_2[1])
    denominator = numpy.sqrt(
        numpy.sum(intensities_1**2) * numpy.sum(intensities_2**2))
    expected_score = numerator / denominator

    assert result["score"] == pytest.approx(
        expected_score, 0.00001), "Expected different cosine score."
    assert result["matches"] == 3, "Expected 3 matching peaks."
def test_modified_cosine_without_precursor_mz():
    """Spectra lacking precursor-m/z must make ``pair`` raise an AssertionError."""
    shared_intensities = [700, 200, 100, 1000, 200, 5, 500]
    spectrum_1 = Spectrum(
        mz=numpy.array([100, 150, 200, 300, 500, 510, 1100], dtype="float"),
        intensities=numpy.array(shared_intensities, dtype="float"),
    )
    spectrum_2 = Spectrum(
        mz=numpy.array([100, 140, 190, 300, 490, 510, 1090], dtype="float"),
        intensities=numpy.array(shared_intensities, dtype="float"),
    )
    norm_spectrum_1 = normalize_intensities(spectrum_1)
    norm_spectrum_2 = normalize_intensities(spectrum_2)
    modified_cosine = ModifiedCosine()

    with pytest.raises(AssertionError) as exc_info:
        modified_cosine.pair(norm_spectrum_1, norm_spectrum_2)

    # The raised error must carry exactly this guidance text.
    assert str(exc_info.value) == (
        "Precursor_mz missing. Apply 'add_precursor_mz' filter first.")
def test_modified_cosine_with_mass_shift_5_no_matches_expected():
    """Two spectra whose peaks should not match: score 0 and 0 matches."""
    intensity_values = [10, 10, 500]
    reference = Spectrum(
        mz=numpy.array([100, 200, 300], dtype="float"),
        intensities=numpy.array(intensity_values, dtype="float"),
        metadata={"precursor_mz": 1000.0},
    )
    query = Spectrum(
        mz=numpy.array([120, 220, 320], dtype="float"),
        intensities=numpy.array(intensity_values, dtype="float"),
        metadata={"precursor_mz": 1005},
    )
    reference_normalized = normalize_intensities(reference)
    query_normalized = normalize_intensities(query)

    result = ModifiedCosine(tolerance=1.0).pair(reference_normalized,
                                                query_normalized)

    assert result["score"] == pytest.approx(
        0.0, 1e-5), "Expected different modified cosine score."
    assert result["matches"] == 0, "Expected 0 matching peaks."
def test_modified_cosine_with_mass_shift_5_tolerance_2():
    """Modified cosine with precursor masses 1000 and 1005 and tolerance 2.0."""
    reference = Spectrum(
        mz=numpy.array([100, 200, 299, 300, 301, 500, 510], dtype="float"),
        intensities=numpy.array([10, 10, 500, 100, 200, 20, 100], dtype="float"),
        metadata={"precursor_mz": 1000.0},
    )
    query = Spectrum(
        mz=numpy.array([105, 205, 305, 306, 505, 517], dtype="float"),
        intensities=numpy.array([10, 10, 500, 100, 20, 100], dtype="float"),
        metadata={"precursor_mz": 1005},
    )
    reference_normalized = normalize_intensities(reference)
    query_normalized = normalize_intensities(query)

    scorer = ModifiedCosine(tolerance=2.0)
    result = scorer.pair(reference_normalized, query_normalized)

    assert result["score"] == pytest.approx(
        0.96788, 0.0001), "Expected different modified cosine score."
    assert result["matches"] == 6, "Expected 6 matching peaks."
def test_modified_cosine_with_mass_shift_5():
    """Modified cosine with default settings on precursor masses 1000 and 1005."""
    intensity_values = [700, 200, 100, 1000, 200, 5, 500]
    reference = Spectrum(
        mz=numpy.array([100, 150, 200, 300, 500, 510, 1100], dtype="float"),
        intensities=numpy.array(intensity_values, dtype="float"),
        metadata={"precursor_mz": 1000.0},
    )
    query = Spectrum(
        mz=numpy.array([55, 105, 205, 304.5, 494.5, 515.5, 1045], dtype="float"),
        intensities=numpy.array(intensity_values, dtype="float"),
        metadata={"precursor_mz": 1005.0},
    )
    reference_normalized = normalize_intensities(reference)
    query_normalized = normalize_intensities(query)

    scorer = ModifiedCosine()
    result = scorer.pair(reference_normalized, query_normalized)

    assert result["score"] == pytest.approx(
        0.081966, 0.0001), "Expected different cosine score."
    assert result["matches"] == 2, "Expected 2 matching peaks."
def test_modified_cosine_precursor_mz_as_invalid_string():
    """A non-numeric precursor_mz string must make ``pair`` raise an AssertionError."""
    intensity_values = [10, 10, 500]
    reference = Spectrum(
        mz=np.array([100, 200, 300], dtype="float"),
        intensities=np.array(intensity_values, dtype="float"),
        metadata={"precursor_mz": 1000.0},
    )
    query = Spectrum(
        mz=np.array([120, 220, 320], dtype="float"),
        intensities=np.array(intensity_values, dtype="float"),
        metadata={"precursor_mz": "mz 1005.0"},
    )
    reference_normalized = normalize_intensities(reference)
    query_normalized = normalize_intensities(query)
    scorer = ModifiedCosine(tolerance=1.0)

    with pytest.raises(AssertionError) as exc_info:
        _ = scorer.pair(reference_normalized, query_normalized)

    # The raised error must carry exactly this guidance text.
    assert str(exc_info.value) == (
        "Precursor_mz missing. Apply 'add_precursor_mz' filter first.")
def test_modified_cosine_precursor_mz_as_string(caplog):
    """A numeric precursor_mz string (harmonization off) is scored and logged."""
    intensity_values = [10, 10, 500]
    reference = Spectrum(
        mz=np.array([100, 200, 300], dtype="float"),
        intensities=np.array(intensity_values, dtype="float"),
        metadata={"precursor_mz": 1000.0},
        metadata_harmonization=False,
    )
    query = Spectrum(
        mz=np.array([120, 220, 320], dtype="float"),
        intensities=np.array(intensity_values, dtype="float"),
        metadata={"precursor_mz": "1005.0"},
        metadata_harmonization=False,
    )
    reference_normalized = normalize_intensities(reference)
    query_normalized = normalize_intensities(query)

    result = ModifiedCosine(tolerance=1.0).pair(reference_normalized,
                                                query_normalized)

    assert result["score"] == pytest.approx(
        0.0, 1e-5), "Expected different modified cosine score."
    assert result["matches"] == 0, "Expected 0 matching peaks."

    # Scoring still succeeds, but the type problem must show up in the log.
    logged_hint = ("Precursor_mz must be of type int or float. "
                   "Apply 'add_precursor_mz' filter first.")
    assert logged_hint in caplog.text, "Expected different log message"
def _split_score(result):
    """Return ``(score, matches)`` from a single pairwise similarity result.

    Accepts results with ``"score"``/``"matches"`` fields, plain
    ``(score, matches)`` sequences, and string placeholders (which are used
    for both values).
    """
    if isinstance(result, str):
        return result, result
    try:
        return result["score"], result["matches"]
    except (TypeError, IndexError, KeyError, ValueError):
        # Plain sequences do not support access by field name.
        return result[0], result[1]


def _split_score_columns(results):
    """Split a list of pairwise results into a score list and a matches list."""
    pairs = [_split_score(result) for result in results]
    return [pair[0] for pair in pairs], [pair[1] for pair in pairs]


def _select_top_n(similarities, top_n):
    """Return, per column, the row indices of the ``top_n`` largest values."""
    n_rows, n_cols = np.shape(similarities)
    if n_rows == 0:
        return np.empty((0, n_cols), dtype="int")
    # argpartition raises if asked for more rows than the matrix has.
    top_n = min(top_n, n_rows)
    return np.argpartition(similarities, -top_n, axis=0)[-top_n:, :]


def library_matching(
        documents_query: List[SpectrumDocument],
        documents_library: List[SpectrumDocument],
        model: BaseTopicModel,
        presearch_based_on: List[str] = ["precursor_mz", "spec2vec-top10"],
        ignore_non_annotated: bool = True,
        include_scores=["spec2vec", "cosine", "modcosine"],
        intensity_weighting_power: float = 0.5,
        allowed_missing_percentage: float = 0,
        cosine_tol: float = 0.005,
        min_matches: int = 6,
        mass_tolerance: float = 2.0,
        mass_tolerance_type: str = "ppm"):
    """Selecting potential spectra matches with spectra library.

    Suitable candidates will be selected by 1) top_n Spec2Vec similarity,
    2) same precursor mass (within given mass tolerance), and/or 3) top_n
    modified cosine similarity. For later matching routines, additional
    scores (cosine, modified cosine) are added as well.

    Args:
    --------
    documents_query:
        List containing all spectrum documents that should be queried against
        the library.
    documents_library:
        List containing all library spectrum documents.
    model:
        Pretrained word2Vec model.
    presearch_based_on:
        List with strings to specify which measures to use for the presearch.
        This can include 'precursor_mz', 'spec2vec-topX', 'modcos-topX'.
    ignore_non_annotated: bool, optional
        If True, only annotated spectra (having "smiles") will be considered
        for matching. Default = True.
    include_scores:
        Scores to report for the selected candidates. Recognized entries are
        "spec2vec", "cosine" and "modcosine". Scores that are not calculated
        are reported as "not calculated".
    intensity_weighting_power: float, optional
        Passed on to Spec2Vec. Default = 0.5.
    allowed_missing_percentage: float, optional
        Passed on to Spec2Vec. Default = 0.
    cosine_tol: float, optional
        Set tolerance for the cosine and modified cosine score.
        Default = 0.005
    min_matches: int, optional
        For the 'modcos-topX' presearch, scores based on fewer matching peaks
        are set to 0 before the top candidates are selected. Default = 6.
    mass_tolerance
        Specify tolerance for a mass match.
    mass_tolerance_type
        Choose between "ppm" (relative) and "Dalton" (absolute) tolerance
        type.

    Returns:
    --------
    List with one entry per query document: a pandas DataFrame indexed by the
    library index of each candidate, or an empty list if no candidate was
    found for that query.
    """
    # NOTE: the list defaults above are never mutated inside this function.

    # Initializations
    found_matches = []
    m_mass_matches = None
    m_spec2vec_similarities = None
    m_modcos_similarities = None
    m_modcos_matches = None
    n_queries = len(documents_query)

    if ignore_non_annotated:
        # Get array of all ids for spectra with smiles
        library_ids = np.asarray(
            [i for i, doc in enumerate(documents_library)
             if doc._obj.get("smiles")],
            dtype=int)
    else:
        library_ids = np.arange(len(documents_library))

    allowed_presearch_type = ["precursor_mz", "spec2vec-top", "modcos-top"]
    msg = "Presearch must include one of: " + ", ".join(allowed_presearch_type)
    # Kept as an assert so callers still see an AssertionError.
    assert any(x in y for x in allowed_presearch_type
               for y in presearch_based_on), msg

    # 1. Search for top-n Spec2Vec matches ------------------------------------
    spec2vec_presearch = [x for x in presearch_based_on if "spec2vec" in x]
    if spec2vec_presearch:
        top_n = int(spec2vec_presearch[0].split("top")[1])
        print(f"Pre-selection includes spec2vec top {top_n}.")
        spec2vec = Spec2Vec(
            model=model,
            intensity_weighting_power=intensity_weighting_power,
            allowed_missing_percentage=allowed_missing_percentage,
            progress_bar=True)
        m_spec2vec_similarities = spec2vec.matrix(
            [documents_library[i] for i in library_ids], documents_query)

        # Select top_n similarity values:
        selection_spec2vec = _select_top_n(m_spec2vec_similarities, top_n)
    else:
        selection_spec2vec = np.empty((0, n_queries), dtype="int")

    # 2. Search for precursor_mz based matches ---------------------------------
    if "precursor_mz" in presearch_based_on:
        print(
            f"Pre-selection includes mass matches within {mass_tolerance} {mass_tolerance_type}."
        )
        mass_matching = PrecursorMzMatch(tolerance=mass_tolerance,
                                         tolerance_type=mass_tolerance_type)
        m_mass_matches = mass_matching.matrix(
            [documents_library[i]._obj for i in library_ids],
            [x._obj for x in documents_query])
        selection_massmatch = [
            np.where(m_mass_matches[:, i] == 1)[0] for i in range(n_queries)
        ]
    else:
        selection_massmatch = np.empty((n_queries, 0), dtype="int")

    # 3. Search for top-n modified cosine matches ------------------------------
    modcos_presearch = [x for x in presearch_based_on if "modcos" in x]
    if modcos_presearch:
        top_n = int(modcos_presearch[0].split("top")[1])
        print(f"Pre-selection includes modified cosine top {top_n}.")
        modcos = ModifiedCosine(tolerance=cosine_tol)

        n_rows = len(library_ids)
        m_modcos_similarities = np.zeros([n_rows, n_queries], dtype=np.float64)
        m_modcos_matches = np.zeros([n_rows, n_queries], dtype=np.float64)
        library_spectra = [documents_library[i]._obj for i in library_ids]
        query_spectra = [x._obj for x in documents_query]
        for i_ref, reference in enumerate(tqdm(library_spectra)):
            for i_query, query in enumerate(query_spectra):
                score, matches = _split_score(modcos.pair(reference, query))
                m_modcos_similarities[i_ref, i_query] = score
                m_modcos_matches[i_ref, i_query] = matches

        # Select top_n similarity values:
        m_modcos_selected = m_modcos_similarities.copy()
        m_modcos_selected[m_modcos_matches < min_matches] = 0
        selection_modcos = _select_top_n(m_modcos_selected, top_n)
    else:
        selection_modcos = np.empty((0, n_queries), dtype="int")

    # 4. Combine found matches -------------------------------------------------
    cosine_similarity = None
    mod_cosine_similarity = None
    spec2vec_similarity = None
    if "cosine" in include_scores:
        print("Calculate cosine score for selected candidates.")
        cosine_similarity = CosineGreedy(tolerance=cosine_tol)
    if "modcosine" in include_scores:
        print("Calculate modified cosine score for selected candidates.")
        if m_modcos_similarities is None:
            mod_cosine_similarity = ModifiedCosine(tolerance=cosine_tol)
    if m_spec2vec_similarities is None and "spec2vec" in include_scores:
        spec2vec_similarity = Spec2Vec(
            model=model,
            intensity_weighting_power=intensity_weighting_power,
            allowed_missing_percentage=allowed_missing_percentage)

    for i in tqdm(range(n_queries)):
        all_match_ids = np.unique(
            np.concatenate((selection_spec2vec[:, i],
                            selection_massmatch[i],
                            selection_modcos[:, i])))

        if len(all_match_ids) == 0:
            found_matches.append([])
            continue

        matched_library_ids = library_ids[all_match_ids]
        query_document = documents_query[i]
        not_calculated = len(all_match_ids) * ["not calculated"]

        # Cosine score for the found matches
        if cosine_similarity is not None:
            cosine_results = [
                cosine_similarity.pair(documents_library[match_id]._obj,
                                       query_document._obj)
                for match_id in matched_library_ids
            ]
        else:
            cosine_results = not_calculated
        cosine_scores, cosine_matches = _split_score_columns(cosine_results)

        # Modified cosine score: reuse the presearch matrices when available
        if m_modcos_similarities is not None:
            mod_cosine_scores = list(m_modcos_similarities[all_match_ids, i])
            mod_cosine_matches = list(m_modcos_matches[all_match_ids, i])
        else:
            if mod_cosine_similarity is not None:
                mod_cosine_results = [
                    mod_cosine_similarity.pair(
                        documents_library[match_id]._obj,
                        query_document._obj)
                    for match_id in matched_library_ids
                ]
            else:
                mod_cosine_results = not_calculated
            mod_cosine_scores, mod_cosine_matches = _split_score_columns(
                mod_cosine_results)

        matches_df = pd.DataFrame(
            {
                "cosine_score": cosine_scores,
                "cosine_matches": cosine_matches,
                "mod_cosine_score": mod_cosine_scores,
                "mod_cosine_matches": mod_cosine_matches
            },
            index=matched_library_ids)

        if m_mass_matches is not None:
            matches_df["mass_match"] = m_mass_matches[all_match_ids, i]

        if m_spec2vec_similarities is not None:
            matches_df["s2v_score"] = m_spec2vec_similarities[all_match_ids, i]
        elif spec2vec_similarity is not None:
            matches_df["s2v_score"] = [
                spec2vec_similarity.pair(documents_library[match_id],
                                         query_document)
                for match_id in matched_library_ids
            ]

        found_matches.append(matches_df.fillna(0))

    return found_matches