def test_cosine_score_greedy_with_tolerance_2_0():
    """Check the CosineGreedy result at tolerance 2.0 against a hand-derived cosine score."""
    mz_1 = numpy.array([100, 299, 300, 301, 510], dtype="float")
    intensities_1 = numpy.array([0.1, 1.0, 0.2, 0.3, 0.4], dtype="float")
    spectrum_1 = Spectrum(mz=mz_1, intensities=intensities_1)

    mz_2 = numpy.array([100, 300, 301, 511], dtype="float")
    intensities_2 = numpy.array([0.1, 1.0, 0.3, 0.4], dtype="float")
    spectrum_2 = Spectrum(mz=mz_2, intensities=intensities_2)

    cosine_greedy = CosineGreedy(tolerance=2.0)
    score, n_matches = cosine_greedy.pair(spectrum_1, spectrum_2)

    # Hand-derived expectation: peak indices (first row: spectrum_1, second
    # row: spectrum_2) whose mz values lie within the tolerance of each other.
    expected_matches = [[0, 1, 3, 4], [0, 1, 2, 3]]
    idx_1, idx_2 = expected_matches[0], expected_matches[1]
    numerator = (spectrum_1.peaks.intensities[idx_1]
                 * spectrum_2.peaks.intensities[idx_2]).sum()
    norm_1 = numpy.sqrt((spectrum_1.peaks.intensities ** 2).sum())
    norm_2 = numpy.sqrt((spectrum_2.peaks.intensities ** 2).sum())
    expected_score = numerator / (norm_1 * norm_2)

    assert score == pytest.approx(expected_score, 0.0001), "Expected different cosine score."
    assert n_matches == len(idx_1), "Expected different number of matching peaks."
def test_cosine_greedy_with_peak_powers():
    """Check the CosineGreedy result against a hand-derived score when mz and
    intensity values are raised to the given powers.
    """
    mz_power = 0.5
    intensity_power = 2.0
    shared_intensities = [0.1, 0.2, 1.0, 0.3, 0.4]

    spectrum_1 = Spectrum(
        mz=numpy.array([100, 200, 300, 500, 510], dtype="float"),
        intensities=numpy.array(shared_intensities, dtype="float"),
    )
    spectrum_2 = Spectrum(
        mz=numpy.array([100, 200, 290, 490, 510], dtype="float"),
        intensities=numpy.array(shared_intensities, dtype="float"),
    )

    cosine_greedy = CosineGreedy(tolerance=1.0,
                                 mz_power=mz_power,
                                 intensity_power=intensity_power)
    score = cosine_greedy.pair(spectrum_1, spectrum_2)

    # Hand-derived expectation: these peak indices have matching mz values
    # (within the given tolerance) in both spectra.
    matches = [0, 1, 4]
    mz1, intensity1 = spectrum_1.peaks.mz, spectrum_1.peaks.intensities
    mz2, intensity2 = spectrum_2.peaks.mz, spectrum_2.peaks.intensities

    numerator = (
        (mz1[matches] ** mz_power)
        * (intensity1[matches] ** intensity_power)
        * (mz2[matches] ** mz_power)
        * (intensity2[matches] ** intensity_power)
    ).sum()
    norm_1 = numpy.sqrt((((mz1 ** mz_power) * (intensity1 ** intensity_power)) ** 2).sum())
    norm_2 = numpy.sqrt((((mz2 ** mz_power) * (intensity2 ** intensity_power)) ** 2).sum())
    expected_score = numerator / (norm_1 * norm_2)

    assert score["score"] == pytest.approx(expected_score, 0.0001), "Expected different cosine score."
    assert score["matches"] == len(matches), "Expected different number of matching peaks."
def get_hits(query_spec, library_spec, precursor_tol=1, metaKey='parent_mass',
             cosine_tol=0.1, decoys=False, passatutto=False, min_match_count=6):
    """Return the best-scoring library candidate for each query spectrum.

    For every query, library spectra whose ``metaKey`` value lies strictly
    between ``value - precursor_tol`` and ``value + precursor_tol`` are scored
    with ``CosineGreedy``. The candidate with the highest score among those
    with at least ``min_match_count`` matched peaks becomes a ``Hit``.

    Args:
        query_spec: Iterable of query spectra.
        library_spec: List of library spectra. NOTE: it is sorted in place by
            the ``metaKey`` metadata value, so the caller's list is reordered.
        precursor_tol: Half-width of the precursor window around the query's
            ``metaKey`` value (window bounds are exclusive).
        metaKey: Metadata key used for the precursor pre-filter.
        cosine_tol: Tolerance passed to ``CosineGreedy``.
        decoys: If True, every hit is labelled ``'decoy'``.
        passatutto: If True (and ``decoys`` is False), hits are labelled with
            ``passatutto_inchis_equal(query, target)`` instead of
            ``inchis_equal(query, target)``.
        min_match_count: Minimum number of matched peaks for a candidate to be
            considered.

    Returns:
        List of ``Hit(query, target, score, label)`` objects, at most one per
        query. Queries without ``metaKey`` in their metadata, without library
        spectra in the precursor window, or without any eligible candidate
        produce no hit.
    """
    cosine = CosineGreedy(tolerance=cosine_tol)
    library_spec.sort(key=lambda x: getMeta(x)[metaKey])
    library_prec_list = [getMeta(x)[metaKey] for x in library_spec]

    hits = []
    for query in query_spec:
        query_meta = getMeta(query)
        if metaKey not in query_meta:
            continue
        query_precursor = query_meta[metaKey]
        min_mz = query_precursor - precursor_tol
        max_mz = query_precursor + precursor_tol

        # Both bounds are exclusive: bisect_right skips entries equal to
        # min_mz, bisect_left stops before entries equal to max_mz.
        pos = bisect.bisect_right(library_prec_list, min_mz)
        pos2 = bisect.bisect_left(library_prec_list, max_mz, pos)
        if pos == pos2:
            # nothing in precursor range
            continue

        best = None  # (score, library spectrum) of the best candidate so far
        for candidate in library_spec[pos:pos2]:
            score, match_count = cosine.pair(query, candidate).item()
            if score != score:  # NaN is the only value unequal to itself
                print('got nan for', query.get('compound_name'),
                      candidate.get('compound_name'))
                continue
            if match_count < min_match_count:
                continue
            # Strict '>' keeps the first candidate on ties, matching a stable
            # descending sort followed by taking the first element.
            if best is None or score > best[0]:
                best = (score, candidate)

        if best is None:
            continue

        score, target = best
        if decoys:
            label = 'decoy'
        elif passatutto:
            label = passatutto_inchis_equal(query, target)
        else:
            label = inchis_equal(query, target)
        hits.append(Hit(query, target, score, label))
    return hits
def test_cosine_score_greedy_order_of_arguments():
    """Check that scoring (A, B) and (B, A) with CosineGreedy gives the same result."""
    spectrum_1 = Spectrum(
        mz=numpy.array([100, 200, 299, 300, 301, 500, 510], dtype="float"),
        intensities=numpy.array([0.02, 0.02, 1.0, 0.2, 0.4, 0.04, 0.2], dtype="float"),
        metadata={},
    )
    spectrum_2 = Spectrum(
        mz=numpy.array([100, 200, 300, 301, 500, 512], dtype="float"),
        intensities=numpy.array([0.02, 0.02, 1.0, 0.2, 0.04, 0.2], dtype="float"),
        metadata={},
    )

    cosine_greedy = CosineGreedy(tolerance=2.0)
    score_1_2 = cosine_greedy.pair(spectrum_1, spectrum_2)
    score_2_1 = cosine_greedy.pair(spectrum_2, spectrum_1)

    # First compare the score field alone, then the complete result.
    message = "Expected that the order of the arguments would not matter."
    assert score_1_2["score"] == score_2_1["score"], message
    assert score_1_2 == score_2_1, message
def test_cosine_greedy_without_parameters():
    """Check the default-configured CosineGreedy result against a hand-derived cosine score."""
    shared_intensities = [0.1, 0.2, 1.0, 0.3, 0.4]
    spectrum_1 = Spectrum(
        mz=numpy.array([100, 200, 300, 500, 510], dtype="float"),
        intensities=numpy.array(shared_intensities, dtype="float"),
    )
    spectrum_2 = Spectrum(
        mz=numpy.array([100, 200, 290, 490, 510], dtype="float"),
        intensities=numpy.array(shared_intensities, dtype="float"),
    )

    cosine_greedy = CosineGreedy()
    score = cosine_greedy.pair(spectrum_1, spectrum_2)

    # Hand-derived expectation: these peak indices have matching mz values
    # (within the given tolerance) in both spectra.
    expected_matches = [0, 1, 4]
    numerator = (spectrum_1.peaks.intensities[expected_matches]
                 * spectrum_2.peaks.intensities[expected_matches]).sum()
    norm_1 = numpy.sqrt((spectrum_1.peaks.intensities ** 2).sum())
    norm_2 = numpy.sqrt((spectrum_2.peaks.intensities ** 2).sum())
    expected_score = numerator / (norm_1 * norm_2)

    assert score["score"] == pytest.approx(expected_score, 0.0001), "Expected different cosine score."
    assert score["matches"] == len(expected_matches), "Expected different number of matching peaks."
def test_cosine_greedy_pair(peaks, tolerance, mz_power, intensity_power, expected_matches):
    """Score two spectra built from ``peaks`` and compare with ``compute_expected_score``.

    ``peaks[0]`` and ``peaks[1]`` each provide the mz values (index 0) and the
    intensities (index 1) of one spectrum.
    """
    builder = SpectrumBuilder()
    first_peaks, second_peaks = peaks[0], peaks[1]
    spectrum_1 = builder.with_mz(first_peaks[0]).with_intensities(first_peaks[1]).build()
    spectrum_2 = builder.with_mz(second_peaks[0]).with_intensities(second_peaks[1]).build()

    cosine_greedy = CosineGreedy(tolerance=tolerance,
                                 mz_power=mz_power,
                                 intensity_power=intensity_power)
    score = cosine_greedy.pair(spectrum_1, spectrum_2)

    expected_score = compute_expected_score(mz_power, intensity_power,
                                            spectrum_1, spectrum_2,
                                            expected_matches)
    expected_n_matches = len(expected_matches[0])

    assert score["score"] == pytest.approx(expected_score, 0.0001), "Expected different cosine score."
    assert score["matches"] == expected_n_matches, "Expected different number of matching peaks."
def library_matching(
        documents_query: List[SpectrumDocument],
        documents_library: List[SpectrumDocument],
        model: BaseTopicModel,
        presearch_based_on: List[str] = ["precursor_mz", "spec2vec-top10"],
        ignore_non_annotated: bool = True,
        include_scores=["spec2vec", "cosine", "modcosine"],
        intensity_weighting_power: float = 0.5,
        allowed_missing_percentage: float = 0,
        cosine_tol: float = 0.005,
        min_matches: int = 6,
        mass_tolerance: float = 2.0,
        mass_tolerance_type: str = "ppm"):
    """Select potential library matches for every query spectrum.

    Candidates are pre-selected by any combination of
    1) top-n Spec2Vec similarity,
    2) precursor m/z match (within the given mass tolerance), and
    3) top-n modified cosine score.
    For the union of the pre-selected candidates, additional scores
    (cosine, modified cosine, Spec2Vec) are collected per query.

    Args:
    --------
    documents_query:
        List containing all spectrum documents that should be queried against
        the library.
    documents_library:
        List containing all library spectrum documents.
    model:
        Pretrained model, passed on to Spec2Vec.
    presearch_based_on:
        List of strings specifying which measures to use for the presearch.
        This can include 'precursor_mz', 'spec2vec-topX' and 'modcos-topX',
        where X is the number of top candidates to keep per query.
        The default list is never modified by this function.
    ignore_non_annotated: bool, optional
        If True, only library spectra with a non-empty "smiles" entry are
        considered for matching. Default = True.
    include_scores:
        Scores to report for the selected candidates. Recognized entries are
        "spec2vec", "cosine" and "modcosine". Cosine / modified cosine columns
        of scores that were not computed are filled with "not calculated".
        The default list is never modified by this function.
    intensity_weighting_power:
        Passed on to Spec2Vec.
    allowed_missing_percentage:
        Passed on to Spec2Vec.
    cosine_tol: float, optional
        Tolerance for the cosine and modified cosine score. Default = 0.005
    min_matches:
        Used in the modified cosine presearch only: pairs with fewer matching
        peaks get their similarity set to 0 before the top-n selection.
    mass_tolerance
        Tolerance for a precursor m/z match.
    mass_tolerance_type
        Tolerance type passed on to PrecursorMzMatch, e.g. "ppm" (relative)
        or "Dalton" (absolute).

    Returns:
    --------
    List with one entry per query document: a pandas DataFrame indexed by the
    position of the candidate in ``documents_library`` with the columns
    "cosine_score", "cosine_matches", "mod_cosine_score",
    "mod_cosine_matches", plus "mass_match" (if the precursor m/z presearch
    was run) and "s2v_score" (if Spec2Vec scores are available or requested).
    Queries without any candidate get an empty list instead of a DataFrame.
    """
    # Initializations
    found_matches = []
    m_mass_matches = None
    m_spec2vec_similarities = None
    m_modcos_similarities = None
    m_modcos_matches = None
    n_queries = len(documents_query)
    not_calculated = "not calculated"

    def presearch_top_n(score_name):
        """Extract X from the first '<score_name>...topX' presearch entry."""
        return int([x.split("top")[1] for x in presearch_based_on
                    if score_name in x][0])

    def select_top_n(similarities, top_n):
        """Return, per column, the row indices of the top_n highest values."""
        n_rows = similarities.shape[0]
        if n_rows == 0:
            return np.empty((0, similarities.shape[1]), dtype="int")
        # Never request more candidates than exist (argpartition would raise).
        top_n = min(top_n, n_rows)
        return np.argpartition(similarities, -top_n, axis=0)[-top_n:, :]

    if ignore_non_annotated:
        # Get array of all ids for spectra with smiles
        # (explicit integer dtype so an empty selection stays a valid index array)
        library_ids = np.asarray(
            [i for i, doc in enumerate(documents_library)
             if doc._obj.get("smiles")],
            dtype="int")
    else:
        library_ids = np.arange(len(documents_library))

    allowed_presearch_type = ["precursor_mz", "spec2vec-top", "modcos-top"]
    msg = "Presearch must include one of: " + ", ".join(allowed_presearch_type)
    assert any((x in y) for x in allowed_presearch_type
               for y in presearch_based_on), msg

    # 1. Search for top-n Spec2Vec matches ------------------------------------
    if any("spec2vec" in x for x in presearch_based_on):
        top_n = presearch_top_n("spec2vec")
        print(f"Pre-selection includes spec2vec top {top_n}.")
        spec2vec = Spec2Vec(
            model=model,
            intensity_weighting_power=intensity_weighting_power,
            allowed_missing_percentage=allowed_missing_percentage,
            progress_bar=True)
        m_spec2vec_similarities = spec2vec.matrix(
            [documents_library[i] for i in library_ids], documents_query)

        # Select top_n similarity values:
        selection_spec2vec = select_top_n(m_spec2vec_similarities, top_n)
    else:
        selection_spec2vec = np.empty((0, n_queries), dtype="int")

    # 2. Search for precursor_mz based matches ---------------------------------
    if "precursor_mz" in presearch_based_on:
        print(
            f"Pre-selection includes mass matches within {mass_tolerance} {mass_tolerance_type}."
        )
        mass_matching = PrecursorMzMatch(tolerance=mass_tolerance,
                                         tolerance_type=mass_tolerance_type)
        m_mass_matches = mass_matching.matrix(
            [documents_library[i]._obj for i in library_ids],
            [x._obj for x in documents_query])
        selection_massmatch = [
            np.where(m_mass_matches[:, i] == 1)[0] for i in range(n_queries)
        ]
    else:
        selection_massmatch = np.empty((n_queries, 0), dtype="int")

    # 3. Search for top-n modified cosine matches ------------------------------
    if any("modcos" in x for x in presearch_based_on):
        top_n = presearch_top_n("modcos")
        print(f"Pre-selection includes modified cosine top {top_n}.")
        modcos = ModifiedCosine(tolerance=cosine_tol)

        library_spectra = [documents_library[i]._obj for i in library_ids]
        query_spectra = [x._obj for x in documents_query]
        n_rows = len(library_ids)
        m_modcos_similarities = np.zeros([n_rows, n_queries], dtype=np.float64)
        m_modcos_matches = np.zeros([n_rows, n_queries], dtype=np.float64)
        for i_ref, reference in enumerate(tqdm(library_spectra)):
            for i_query, query in enumerate(query_spectra):
                # Field access, consistent with how pair() results are read
                # in step 4 below.
                score = modcos.pair(reference, query)
                m_modcos_similarities[i_ref, i_query] = score["score"]
                m_modcos_matches[i_ref, i_query] = score["matches"]

        # Select top_n similarity values (pairs with too few matching peaks
        # are excluded from the ranking by zeroing their similarity):
        m_modcos_selected = m_modcos_similarities.copy()
        m_modcos_selected[m_modcos_matches < min_matches] = 0
        selection_modcos = select_top_n(m_modcos_selected, top_n)
    else:
        selection_modcos = np.empty((0, n_queries), dtype="int")

    # 4. Combine found matches ------------------------------------------------
    calc_cosine = "cosine" in include_scores
    calc_modcosine = (m_modcos_similarities is None
                      and "modcosine" in include_scores)
    calc_spec2vec = (m_spec2vec_similarities is None
                     and "spec2vec" in include_scores)

    if calc_cosine:
        print("Calculate cosine score for selected candidates.")
    if "modcosine" in include_scores:
        print("Calculate modified cosine score for selected candidates.")

    # Similarity objects do not depend on the query: build each one once.
    cosine_similarity = (CosineGreedy(tolerance=cosine_tol)
                         if calc_cosine else None)
    mod_cosine_similarity = (ModifiedCosine(tolerance=cosine_tol)
                             if calc_modcosine else None)
    spec2vec_similarity = None
    if calc_spec2vec:
        spec2vec_similarity = Spec2Vec(
            model=model,
            intensity_weighting_power=intensity_weighting_power,
            allowed_missing_percentage=allowed_missing_percentage)

    for i in tqdm(range(n_queries)):
        # Row indices into the (possibly filtered) library selection.
        all_match_ids = np.unique(
            np.concatenate((selection_spec2vec[:, i],
                            selection_massmatch[i],
                            selection_modcos[:, i])))
        if len(all_match_ids) == 0:
            found_matches.append([])
            continue

        # Map back to positions in documents_library.
        matched_library_ids = library_ids[all_match_ids]
        query_document = documents_query[i]
        n_candidates = len(all_match_ids)

        if cosine_similarity is not None:
            # Get cosine score for found matches
            cosine_results = [
                cosine_similarity.pair(documents_library[match_id]._obj,
                                       query_document._obj)
                for match_id in matched_library_ids
            ]
            cosine_score = [x["score"] for x in cosine_results]
            cosine_matches = [x["matches"] for x in cosine_results]
        else:
            cosine_score = n_candidates * [not_calculated]
            cosine_matches = n_candidates * [not_calculated]

        if m_modcos_similarities is not None:
            # Reuse the values already computed during the presearch.
            mod_cosine_score = list(m_modcos_similarities[all_match_ids, i])
            mod_cosine_matches = list(m_modcos_matches[all_match_ids, i])
        elif mod_cosine_similarity is not None:
            # Get modified cosine score for found matches
            mod_cosine_results = [
                mod_cosine_similarity.pair(documents_library[match_id]._obj,
                                           query_document._obj)
                for match_id in matched_library_ids
            ]
            mod_cosine_score = [x["score"] for x in mod_cosine_results]
            mod_cosine_matches = [x["matches"] for x in mod_cosine_results]
        else:
            mod_cosine_score = n_candidates * [not_calculated]
            mod_cosine_matches = n_candidates * [not_calculated]

        matches_df = pd.DataFrame(
            {
                "cosine_score": cosine_score,
                "cosine_matches": cosine_matches,
                "mod_cosine_score": mod_cosine_score,
                "mod_cosine_matches": mod_cosine_matches
            },
            index=matched_library_ids)

        if m_mass_matches is not None:
            matches_df["mass_match"] = m_mass_matches[all_match_ids, i]

        if m_spec2vec_similarities is not None:
            matches_df["s2v_score"] = m_spec2vec_similarities[all_match_ids, i]
        elif spec2vec_similarity is not None:
            matches_df["s2v_score"] = [
                spec2vec_similarity.pair(documents_library[match_id],
                                         query_document)
                for match_id in matched_library_ids
            ]

        found_matches.append(matches_df.fillna(0))

    return found_matches