def test_cosine_greedy_with_peak_powers():
    """Check CosineGreedy against a hand-derived score when peak powers are set.

    The m/z values and intensities are raised to ``mz_power`` and
    ``intensity_power`` respectively before the cosine score is formed.
    """
    mz_power = 0.5
    intensity_power = 2.0
    first = Spectrum(mz=numpy.array([100, 200, 300, 500, 510], dtype="float"),
                     intensities=numpy.array([0.1, 0.2, 1.0, 0.3, 0.4], dtype="float"))
    second = Spectrum(mz=numpy.array([100, 200, 290, 490, 510], dtype="float"),
                      intensities=numpy.array([0.1, 0.2, 1.0, 0.3, 0.4], dtype="float"))

    similarity = CosineGreedy(tolerance=1.0, mz_power=mz_power,
                              intensity_power=intensity_power)
    result = similarity.pair(first, second)

    # Hand-derive the expected score. These peak indices have m/z values that
    # agree within the given tolerance.
    matched = [0, 1, 4]
    int_a, mz_a = first.peaks.intensities, first.peaks.mz
    int_b, mz_b = second.peaks.intensities, second.peaks.mz
    matched_products = (mz_a[matched] ** mz_power) * (int_a[matched] ** intensity_power) \
        * (mz_b[matched] ** mz_power) * (int_b[matched] ** intensity_power)
    norm_a = numpy.sqrt((((mz_a ** mz_power) * (int_a ** intensity_power)) ** 2).sum())
    norm_b = numpy.sqrt((((mz_b ** mz_power) * (int_b ** intensity_power)) ** 2).sum())
    normalisation = norm_a * norm_b
    expected_score = matched_products.sum() / normalisation

    assert result["score"] == pytest.approx(expected_score, 0.0001), "Expected different cosine score."
    assert result["matches"] == len(matched), "Expected different number of matching peaks."
def test_cosine_score_greedy_with_tolerance_2_0():
    """Check the tolerance=2.0 cosine score against a hand-derived value."""
    first = Spectrum(mz=numpy.array([100, 299, 300, 301, 510], dtype="float"),
                     intensities=numpy.array([0.1, 1.0, 0.2, 0.3, 0.4], dtype="float"))
    second = Spectrum(mz=numpy.array([100, 300, 301, 511], dtype="float"),
                      intensities=numpy.array([0.1, 1.0, 0.3, 0.4], dtype="float"))

    score, n_matches = CosineGreedy(tolerance=2.0).pair(first, second)

    # Peak index pairs (first spectrum, second spectrum) whose m/z values
    # agree within the given tolerance.
    idx_first, idx_second = [0, 1, 3, 4], [0, 1, 2, 3]
    matched_products = first.peaks.intensities[idx_first] \
        * second.peaks.intensities[idx_second]
    norm_first = numpy.sqrt((first.peaks.intensities ** 2).sum())
    norm_second = numpy.sqrt((second.peaks.intensities ** 2).sum())
    normalisation = norm_first * norm_second
    expected_score = matched_products.sum() / normalisation

    assert score == pytest.approx(expected_score, 0.0001), "Expected different cosine score."
    assert n_matches == len(idx_first), "Expected different number of matching peaks."
def test_cosine_greedy_with_arrays_symmetric():
    """Check that CosineGreedy.matrix with is_symmetric=True yields a symmetric result."""
    first = Spectrum(mz=numpy.array([100, 200, 300], dtype="float"),
                     intensities=numpy.array([0.1, 0.2, 1.0], dtype="float"))
    second = Spectrum(mz=numpy.array([110, 190, 290], dtype="float"),
                      intensities=numpy.array([0.5, 0.2, 1.0], dtype="float"))
    both = [first, second]

    result = CosineGreedy().matrix(both, both, is_symmetric=True)

    # Compare the diagonal entries first, then the two off-diagonal entries.
    for (row_a, col_a), (row_b, col_b) in (((0, 0), (1, 1)), ((0, 1), (1, 0))):
        assert result[row_a][col_a][0] == pytest.approx(result[row_b][col_b][0], 0.000001), \
            "Expected different cosine score."
def get_hits(query_spec, library_spec, precursor_tol=1, metaKey='parent_mass',
             cosine_tol=0.1, decoys=False, passatutto=False, min_match_count=6):
    """Find the best-scoring library spectrum for every query spectrum.

    For each query, the library spectra whose ``metaKey`` value lies strictly
    between ``query - precursor_tol`` and ``query + precursor_tol`` are scored
    with ``CosineGreedy``; the highest-scoring candidate with at least
    ``min_match_count`` matching peaks becomes a ``Hit``.

    Args:
        query_spec: Iterable of query spectra.
        library_spec: List of library spectra. It is sorted IN PLACE by
            ``metaKey`` (kept for backward compatibility with callers).
        precursor_tol: Half-width of the ``metaKey`` window around the query.
        metaKey: Metadata key used for the precursor window.
        cosine_tol: Peak tolerance passed to ``CosineGreedy``.
        decoys: If True, every hit is labelled ``'decoy'``.
        passatutto: If True (and ``decoys`` is False), the hit label comes
            from ``passatutto_inchis_equal`` instead of ``inchis_equal``.
        min_match_count: Minimum number of matching peaks for a candidate.

    Returns:
        List of ``Hit`` objects, at most one per query. Queries without the
        metadata key, without candidates in range, or without a candidate
        reaching ``min_match_count`` produce no hit.
    """
    cosine = CosineGreedy(tolerance=cosine_tol)
    library_spec.sort(key=lambda spec: getMeta(spec)[metaKey])
    library_masses = [getMeta(spec)[metaKey] for spec in library_spec]

    hits = []
    for query in query_spec:
        query_meta = getMeta(query)
        if metaKey not in query_meta:
            continue

        query_mass = query_meta[metaKey]
        # Open interval (query_mass - tol, query_mass + tol): bisect_right
        # skips values equal to the lower bound, bisect_left stops before
        # values equal to the upper bound.
        start = bisect.bisect_right(library_masses, query_mass - precursor_tol)
        stop = bisect.bisect_left(library_masses, query_mass + precursor_tol, start)
        if start == stop:
            # nothing in precursor range
            continue

        candidates = []
        for candidate in library_spec[start:stop]:
            score, match_count = cosine.pair(query, candidate).item()
            if score != score:  # NaN is the only value not equal to itself
                print('got nan for', query.get('compound_name'),
                      candidate.get('compound_name'))
                continue
            if match_count >= min_match_count:
                candidates.append((score, candidate))

        if not candidates:
            continue

        # max() returns the first maximal element, which is the same element a
        # stable descending sort would place first.
        best_score, best_match = max(candidates, key=lambda pair: pair[0])

        if decoys:
            label = 'decoy'
        elif passatutto:
            label = passatutto_inchis_equal(query, best_match)
        else:
            label = inchis_equal(query, best_match)
        hits.append(Hit(query, best_match, best_score, label))

    return hits
def test_cosine_score_greedy_order_of_arguments():
    """Check that CosineGreedy.pair gives the same result for (A, B) and (B, A)."""
    spectrum_a = Spectrum(
        mz=numpy.array([100, 200, 299, 300, 301, 500, 510], dtype="float"),
        intensities=numpy.array([0.02, 0.02, 1.0, 0.2, 0.4, 0.04, 0.2], dtype="float"),
        metadata={})
    spectrum_b = Spectrum(
        mz=numpy.array([100, 200, 300, 301, 500, 512], dtype="float"),
        intensities=numpy.array([0.02, 0.02, 1.0, 0.2, 0.04, 0.2], dtype="float"),
        metadata={})

    similarity = CosineGreedy(tolerance=2.0)
    forward = similarity.pair(spectrum_a, spectrum_b)
    backward = similarity.pair(spectrum_b, spectrum_a)

    message = "Expected that the order of the arguments would not matter."
    assert forward["score"] == backward["score"], message
    assert forward == backward, message
def test_scores_by_reference_sorted():
    """Test scores_by_reference method with sort=True."""
    # (mz values, intensities, id) for spectrum_1 .. spectrum_4
    peak_table = [
        ([100, 150, 200.], [0.7, 0.2, 0.1], 'spectrum1'),
        ([100, 140, 190.], [0.4, 0.2, 0.1], 'spectrum2'),
        ([110, 140, 195.], [0.6, 0.2, 0.1], 'spectrum3'),
        ([100, 150, 200.], [0.6, 0.1, 0.6], 'spectrum4'),
    ]
    spectrum_1, spectrum_2, spectrum_3, spectrum_4 = [
        Spectrum(mz=numpy.array(mz_values),
                 intensities=numpy.array(intensity_values),
                 metadata={'id': spectrum_id})
        for mz_values, intensity_values, spectrum_id in peak_table
    ]

    scores = calculate_scores([spectrum_1, spectrum_2, spectrum_3],
                              [spectrum_3, spectrum_4, spectrum_2],
                              CosineGreedy())
    selected = scores.scores_by_reference(spectrum_2, sort=True)

    # Row 1 of the score matrix belongs to spectrum_2; the expected query
    # order after sorting is index 2, then 1, then 0.
    expected = [(scores.queries[col], scores.scores[1, col]) for col in [2, 1, 0]]
    assert selected == expected, "Expected different scores."

    actual_values = numpy.array([entry[1]["score"] for entry in selected])
    expected_values = numpy.array([1.0, 0.6129713330865563, 0.1363196353181994])
    assert numpy.allclose(actual_values, expected_values, atol=1e-8), \
        "Expected different sorted scores."
def test_cosine_greedy_without_parameters():
    """Check the default-parameter CosineGreedy score against a hand-derived value."""
    first = Spectrum(mz=numpy.array([100, 200, 300, 500, 510], dtype="float"),
                     intensities=numpy.array([0.1, 0.2, 1.0, 0.3, 0.4], dtype="float"))
    second = Spectrum(mz=numpy.array([100, 200, 290, 490, 510], dtype="float"),
                      intensities=numpy.array([0.1, 0.2, 1.0, 0.3, 0.4], dtype="float"))

    result = CosineGreedy().pair(first, second)

    # Peak indices whose m/z values agree within the given tolerance.
    matched = [0, 1, 4]
    matched_products = first.peaks.intensities[matched] \
        * second.peaks.intensities[matched]
    norm_first = numpy.sqrt((first.peaks.intensities ** 2).sum())
    norm_second = numpy.sqrt((second.peaks.intensities ** 2).sum())
    normalisation = norm_first * norm_second
    expected_score = matched_products.sum() / normalisation

    assert result["score"] == pytest.approx(expected_score, 0.0001), "Expected different cosine score."
    assert result["matches"] == len(matched), "Expected different number of matching peaks."
def test_user_workflow():
    """Run a filter -> score -> rank workflow on pesticides.mgf and check the top 10.

    Uses CosineGreedy with tolerance=0.3 and keeps only non-self comparisons
    with at least 20 matching peaks.
    """

    def apply_my_filters(s):
        # Each filter receives the result of the previous one.
        s = default_filters(s)
        s = add_parent_mass(s)
        s = normalize_intensities(s)
        s = select_by_relative_intensity(s, intensity_from=0.0, intensity_to=1.0)
        s = select_by_mz(s, mz_from=0, mz_to=1000)
        s = require_minimum_number_of_peaks(s, n_required=5)
        return s

    module_root = os.path.join(os.path.dirname(__file__), "..")
    spectrums_file = os.path.join(module_root, "tests", "pesticides.mgf")

    # apply the filters and omit spectrums that didn't qualify for analysis
    filtered_spectrums = [apply_my_filters(s) for s in load_from_mgf(spectrums_file)]
    spectrums = [s for s in filtered_spectrums if s is not None]

    # this will be a library grouping analysis, so queries = references = spectrums
    queries = spectrums[:]
    references = spectrums[:]

    similarity = CosineGreedy(tolerance=0.3)
    scores = list(calculate_scores(references, queries, similarity))

    # filter out self-comparisons, require at least 20 matching peaks
    candidates = [(ref, qry, value, n_peaks)
                  for (ref, qry, value, n_peaks) in scores
                  if ref != qry and n_peaks >= 20]
    ranked = sorted(candidates, key=lambda entry: entry[2], reverse=True)
    actual_top10 = ranked[:10]

    # (reference index, query index, expected score, expected matching peaks)
    expected_table = [
        (48, 50, 0.9994783627790965, 25),
        (50, 48, 0.9994783627790965, 25),
        (46, 48, 0.9990141860269471, 27),
        (48, 46, 0.9990141860269471, 27),
        (46, 50, 0.9988793406908719, 22),
        (50, 46, 0.9988793406908719, 22),
        (57, 59, 0.9982171275552505, 46),
        (59, 57, 0.9982171275552505, 46),
        (73, 74, 0.9973823244169199, 23),
        (74, 73, 0.9973823244169199, 23),
    ]
    expected_top10 = [
        (references[ref_idx], queries[qry_idx], pytest.approx(value, rel=1e-9), n_peaks)
        for ref_idx, qry_idx, value, n_peaks in expected_table
    ]
    assert actual_top10 == expected_top10
def test_scores_by_query():
    """Test scores_by_query method."""
    spectrum_1, spectrum_2, spectrum_3, spectrum_4 = spectra()

    scores = calculate_scores([spectrum_1, spectrum_2, spectrum_3],
                              [spectrum_2, spectrum_3, spectrum_4],
                              CosineGreedy())
    selected = scores.scores_by_query(spectrum_4)

    # spectrum_4 is the query at column 2 of the score matrix.
    expected = [(scores.references[row], scores.scores[row, 2]) for row in range(3)]
    assert selected == expected, "Expected different scores."
def test_cosine_greedy_pair(peaks, tolerance, mz_power, intensity_power, expected_matches):
    """Check CosineGreedy.pair against compute_expected_score for the given case.

    ``peaks[k][0]`` and ``peaks[k][1]`` hold the m/z values and intensities of
    spectrum ``k``; ``expected_matches[0]`` lists the matched peaks of the
    first spectrum.
    """
    first_peaks, second_peaks = peaks[0], peaks[1]
    builder = SpectrumBuilder()
    first = builder.with_mz(first_peaks[0]).with_intensities(first_peaks[1]).build()
    second = builder.with_mz(second_peaks[0]).with_intensities(second_peaks[1]).build()

    similarity = CosineGreedy(tolerance=tolerance, mz_power=mz_power,
                              intensity_power=intensity_power)
    result = similarity.pair(first, second)

    expected_score = compute_expected_score(mz_power, intensity_power,
                                            first, second, expected_matches)

    assert result["score"] == pytest.approx(expected_score, 0.0001), \
        "Expected different cosine score."
    assert result["matches"] == len(expected_matches[0]), \
        "Expected different number of matching peaks."
def test_cosine_greedy_matrix(symmetric):
    """Check that CosineGreedy.matrix gives a symmetric result for identical inputs.

    Each entry is compared both by position (index 0) and by the "score" field.
    """
    builder = SpectrumBuilder()
    first = builder.with_mz(
        numpy.array([100, 200, 300], dtype="float")
    ).with_intensities(
        numpy.array([0.1, 0.2, 1.0], dtype="float")
    ).build()
    second = builder.with_mz(
        numpy.array([110, 190, 290], dtype="float")
    ).with_intensities(
        numpy.array([0.5, 0.2, 1.0], dtype="float")
    ).build()
    both = [first, second]

    result = CosineGreedy().matrix(both, both, is_symmetric=symmetric)

    # Diagonal pair first, then the off-diagonal pair.
    for (row_a, col_a), (row_b, col_b) in (((0, 0), (1, 1)), ((0, 1), (1, 0))):
        for field in (0, "score"):
            assert result[row_a][col_a][field] == pytest.approx(
                result[row_b][col_b][field], 0.000001), "Expected different cosine score."
def return_list_cosine_scores(query, library, type):
    """Return the best-scoring ``CosineHit`` for every query spectrum.

    Args:
        query: Iterable of query spectra.
        library: Iterable of library (or decoy) spectra.
        type: Either ``"library"`` or ``"decoy"``. (The name shadows the
            builtin but is part of the public signature, so it is kept.)
            In ``"decoy"`` mode every library entry is scored. In
            ``"library"`` mode only entries whose ``precursor_mz`` passes
            ``are_peaks_similar`` are scored; the others get a zero-score hit.

    Returns:
        A list with one ``CosineHit`` per query (the last element after
        sorting that query's hits), or ``False`` if ``type`` is invalid.

    Raises:
        IndexError: if ``library`` is empty, since there is no hit to select.
    """
    if type not in ("library", "decoy"):
        print("library type parameter must be either library or decoy")
        return False

    cosine_greedy = CosineGreedy(tolerance=0.2)
    filter_by_precursor = type == "library"

    scores = []
    for spec in query:
        prelim_scores = []
        for d in library:
            if filter_by_precursor and not are_peaks_similar(
                    spec.metadata['precursor_mz'], d.metadata['precursor_mz']):
                # Previously this zero-score hit was created but never stored,
                # so a query without any precursor match raised IndexError
                # when the best hit was selected below.
                prelim_scores.append(CosineHit(0, type, spec, d))
                continue
            score, _n_matches = cosine_greedy(d, spec)
            prelim_scores.append(CosineHit(score, type, spec, d))
        # Keep sorted()[-1] rather than max() so ties resolve as before.
        scores.append(sorted(prelim_scores)[-1])
    return scores
def test_scores_by_reference_sorted():
    """Test scores_by_reference method with sort=True."""
    spectrum_1, spectrum_2, spectrum_3, spectrum_4 = spectra()

    scores = calculate_scores([spectrum_1, spectrum_2, spectrum_3],
                              [spectrum_3, spectrum_4, spectrum_2],
                              CosineGreedy())
    selected = scores.scores_by_reference(spectrum_2, sort=True)

    # Row 1 of the score matrix belongs to spectrum_2; the expected query
    # order after sorting is index 2, then 1, then 0.
    expected = [(scores.queries[col], scores.scores[1, col]) for col in [2, 1, 0]]
    assert selected == expected, "Expected different scores."

    actual_values = numpy.array([entry[1]["score"] for entry in selected])
    expected_values = numpy.array([1.0, 0.6129713330865563, 0.1363196353181994])
    assert numpy.allclose(actual_values, expected_values, atol=1e-8), \
        "Expected different sorted scores."
def test_cosine_greedy_without_parameters():
    """Check the default CosineGreedy score and match count on normalized spectra."""
    raw_first = Spectrum(
        mz=numpy.array([100, 150, 200, 300, 500, 510, 1100], dtype="float"),
        intensities=numpy.array([700, 200, 100, 1000, 200, 5, 500], dtype="float"))
    raw_second = Spectrum(
        mz=numpy.array([100, 140, 190, 300, 490, 510, 1090], dtype="float"),
        intensities=numpy.array([700, 200, 100, 1000, 200, 5, 500], dtype="float"))
    first = normalize_intensities(raw_first)
    second = normalize_intensities(raw_second)

    similarity = CosineGreedy()
    score, n_matches = similarity(first, second)

    assert score == pytest.approx(0.81421, 0.0001), "Expected different cosine score."
    assert n_matches == 3
def test_cosine_score_greedy_with_tolerance_2_0():
    """Check the tolerance=2.0 score and match count on normalized spectra."""
    raw_first = Spectrum(
        mz=numpy.array([100, 200, 299, 300, 301, 500, 510], dtype="float"),
        intensities=numpy.array([10, 10, 500, 100, 200, 20, 100], dtype="float"),
        metadata={})
    raw_second = Spectrum(
        mz=numpy.array([100, 200, 300, 301, 500, 512], dtype="float"),
        intensities=numpy.array([10, 10, 500, 100, 20, 100], dtype="float"),
        metadata={})
    first = normalize_intensities(raw_first)
    second = normalize_intensities(raw_second)

    similarity = CosineGreedy(tolerance=2.0)
    score, n_matches = similarity(first, second)

    assert score == pytest.approx(0.903412, 0.0001), "Expected different cosine score."
    assert n_matches == 6
def test_scores_by_query():
    """Test scores_by_query method."""
    # (mz values, intensities, id) for spectrum_1 .. spectrum_4
    peak_table = [
        ([100, 150, 200.], [0.7, 0.2, 0.1], 'spectrum1'),
        ([100, 140, 190.], [0.4, 0.2, 0.1], 'spectrum2'),
        ([110, 140, 195.], [0.6, 0.2, 0.1], 'spectrum3'),
        ([100, 150, 200.], [0.6, 0.1, 0.6], 'spectrum4'),
    ]
    spectrum_1, spectrum_2, spectrum_3, spectrum_4 = [
        Spectrum(mz=numpy.array(mz_values),
                 intensities=numpy.array(intensity_values),
                 metadata={'id': spectrum_id})
        for mz_values, intensity_values, spectrum_id in peak_table
    ]

    scores = Scores([spectrum_1, spectrum_2, spectrum_3],
                    [spectrum_2, spectrum_3, spectrum_4],
                    CosineGreedy()).calculate()
    selected = scores.scores_by_query(spectrum_4)

    # spectrum_4 is the query at column 2; each score entry is unpacked into
    # the expected tuple after the reference.
    expected = [(scores.references[row], *scores.scores[row, 2]) for row in range(3)]
    assert selected == expected, "Expected different scores."
def test_cosine_score_greedy_order_of_arguments():
    """Check that swapping the two spectra changes neither score nor match count."""
    raw_first = Spectrum(
        mz=numpy.array([100, 200, 299, 300, 301, 500, 510], dtype="float"),
        intensities=numpy.array([10, 10, 500, 100, 200, 20, 100], dtype="float"),
        metadata={})
    raw_second = Spectrum(
        mz=numpy.array([100, 200, 300, 301, 500, 512], dtype="float"),
        intensities=numpy.array([10, 10, 500, 100, 20, 100], dtype="float"),
        metadata={})
    first = normalize_intensities(raw_first)
    second = normalize_intensities(raw_second)

    similarity = CosineGreedy(tolerance=2.0)
    forward_score, forward_matches = similarity(first, second)
    backward_score, backward_matches = similarity(second, first)

    message = "Expected that the order of the arguments would not matter."
    assert forward_score == backward_score, message
    assert forward_matches == backward_matches, message
def test_cosine_score_greedy_with_tolerance_0_2():
    """Check the tolerance=0.2 score and match count on normalized spectra."""
    raw_first = Spectrum(
        mz=numpy.array([100, 150, 200, 300, 500, 510, 1100], dtype="float"),
        intensities=numpy.array([700, 200, 100, 1000, 200, 5, 500], dtype="float"),
        metadata={})
    raw_second = Spectrum(
        mz=numpy.array([50, 100, 200, 299.5, 489.5, 510.5, 1040], dtype="float"),
        intensities=numpy.array([700, 200, 100, 1000, 200, 5, 500], dtype="float"),
        metadata={})
    first = normalize_intensities(raw_first)
    second = normalize_intensities(raw_second)

    similarity = CosineGreedy(tolerance=0.2)
    score, n_matches = similarity(first, second)

    assert score == pytest.approx(0.081966, 0.0001), "Expected different cosine score."
    assert n_matches == 2
def test_scores_by_query_sorted():
    """Test scores_by_query method with sort=True."""
    builder = SpectrumBuilder()

    def make_spectrum(mz_values, intensity_values, spectrum_id):
        # All four spectra are produced by the same builder instance.
        return builder.with_mz(numpy.array(mz_values)).with_intensities(
            numpy.array(intensity_values)).with_metadata({'id': spectrum_id}).build()

    spectrum_1 = make_spectrum([100, 150, 200.], [0.7, 0.2, 0.1], 'spectrum1')
    spectrum_2 = make_spectrum([100, 140, 190.], [0.4, 0.2, 0.1], 'spectrum2')
    spectrum_3 = make_spectrum([100, 140, 195.], [0.6, 0.2, 0.1], 'spectrum3')
    spectrum_4 = make_spectrum([100, 150, 200.], [0.6, 0.1, 0.6], 'spectrum4')

    scores = calculate_scores([spectrum_1, spectrum_2, spectrum_3],
                              [spectrum_2, spectrum_3, spectrum_4],
                              CosineGreedy())
    selected = scores.scores_by_query(spectrum_4, sort=True)

    # spectrum_4 is the query at column 2; the expected reference order after
    # sorting is index 0, then 2, then 1.
    expected = [(scores.references[row], scores.scores[row, 2]) for row in [0, 2, 1]]
    assert selected == expected, "Expected different scores."
def library_matching(documents_query: List[SpectrumDocument],
                     documents_library: List[SpectrumDocument],
                     model,
                     presearch_based_on=["parentmass", "spec2vec-top10"],
                     ignore_non_annotated: bool = True,
                     include_scores=["spec2vec", "cosine", "modcosine"],
                     intensity_weighting_power: float = 0.5,
                     allowed_missing_percentage: float = 0,
                     cosine_tol: float = 0.005,
                     mass_tolerance: float = 1.0):
    """Selecting potential spectra matches with spectra library.

    Suitable candidates will be selected by 1) top_n Spec2Vec similarity, and
    2) same parent mass (within the given mass tolerance).
    For later matching routines, additional scores (cosine, modified cosine)
    are added as well.

    Args:
    --------
    documents_query:
        List containing all spectrum documents that should be queried against
        the library.
    documents_library:
        List containing all library spectrum documents.
    model:
        Pretrained word2Vec model.
    presearch_based_on:
        List of strings selecting the presearch measures: 'parentmass' and/or
        'spec2vec-topX', where X is the number of top Spec2Vec matches to keep
        (e.g. 'spec2vec-top10'). The list is only read, never modified.
    ignore_non_annotated: bool, optional
        If True, only annotated spectra (those with a 'smiles' entry) will be
        considered for matching. Default = True.
    include_scores:
        List of scores to compute for the found matches: 'spec2vec',
        'cosine' and/or 'modcosine'. The list is only read, never modified.
    intensity_weighting_power: float, optional
        Passed on to Spec2Vec. Default = 0.5.
    allowed_missing_percentage: float, optional
        Passed on to Spec2Vec. Default = 0.
    cosine_tol: float, optional
        Set tolerance for the cosine and modified cosine score. Default = 0.005
    mass_tolerance
        Specify tolerance for a parentmass match.

    Returns:
    --------
    List with one entry per query document: a pandas DataFrame (indexed by
    library id) holding the scores of all found matches, or an empty list if
    no match was found for that query.
    """
    # Initializations
    found_matches = []
    m_mass_matches = None
    m_spec2vec_similarities = None

    library_spectra_metadata = [doc._obj.get("smiles") for doc in documents_library]
    if ignore_non_annotated:
        # Get array of all ids for spectra with smiles
        library_ids = np.asarray(
            [i for i, x in enumerate(library_spectra_metadata) if x])
    else:
        library_ids = np.arange(len(documents_library))

    msg = "Presearch must be done either by 'parentmass' and/or 'spec2vec-topX'"
    assert "parentmass" in presearch_based_on or np.any(
        ["spec2vec" in x for x in presearch_based_on]), msg

    # 1. Search for top-n Spec2Vec matches ------------------------------------
    if np.any(["spec2vec" in x for x in presearch_based_on]):
        top_n = int([
            x.split("top")[1] for x in presearch_based_on if "spec2vec" in x
        ][0])
        print("Pre-selection includes spec2vec top {}.".format(top_n))
        spec2vec = Spec2Vec(
            model=model,
            intensity_weighting_power=intensity_weighting_power,
            allowed_missing_percentage=allowed_missing_percentage)
        m_spec2vec_similarities = spec2vec.matrix(
            [documents_library[i] for i in library_ids], documents_query)

        # Select top_n similarity values:
        selection_spec2vec = np.argpartition(m_spec2vec_similarities,
                                             -top_n, axis=0)[-top_n:, :]
    else:
        selection_spec2vec = np.empty((0, len(documents_query)), dtype="int")

    # 2. Search for parent mass based matches ---------------------------------
    if "parentmass" in presearch_based_on:
        mass_matching = ParentmassMatch(mass_tolerance)
        m_mass_matches = mass_matching.matrix(
            [documents_library[i]._obj for i in library_ids],
            [x._obj for x in documents_query])
        selection_massmatch = [
            np.where(m_mass_matches[:, i] == 1)[0]
            for i in range(len(documents_query))
        ]
    else:
        selection_massmatch = np.empty((len(documents_query), 0), dtype="int")

    # 3. Combine found matches ------------------------------------------------
    for i, query_document in enumerate(documents_query):
        s2v_top_ids = selection_spec2vec[:, i]
        mass_match_ids = selection_massmatch[i]

        all_match_ids = np.unique(np.concatenate(
            (s2v_top_ids, mass_match_ids)))

        if len(all_match_ids) > 0:
            # NOTE(review): when a score is not requested, the placeholder
            # strings below are still indexed with x[0] / x[1] in the
            # DataFrame construction (yielding single characters). Kept as in
            # the original; confirm whether callers rely on it.
            if "cosine" in include_scores:
                # Get cosine score for found matches. pair() scores one
                # reference against one query; matrix() expects lists.
                cosine_similarity = CosineGreedy(tolerance=cosine_tol)
                cosine_scores = [
                    cosine_similarity.pair(documents_library[match_id]._obj,
                                           query_document._obj)
                    for match_id in library_ids[all_match_ids]
                ]
            else:
                cosine_scores = len(all_match_ids) * ["not calculated"]

            if "modcosine" in include_scores:
                # Get modified cosine score for found matches
                mod_cosine_similarity = ModifiedCosine(tolerance=cosine_tol)
                mod_cosine_scores = [
                    mod_cosine_similarity.pair(documents_library[match_id]._obj,
                                               query_document._obj)
                    for match_id in library_ids[all_match_ids]
                ]
            else:
                mod_cosine_scores = len(all_match_ids) * ["not calculated"]

            matches_df = pd.DataFrame(
                {
                    "cosine_score": [x[0] for x in cosine_scores],
                    "cosine_matches": [x[1] for x in cosine_scores],
                    "mod_cosine_score": [x[0] for x in mod_cosine_scores],
                    "mod_cosine_matches": [x[1] for x in mod_cosine_scores]
                },
                index=library_ids[all_match_ids])

            if m_mass_matches is not None:
                matches_df["mass_match"] = m_mass_matches[all_match_ids, i]

            if m_spec2vec_similarities is not None:
                matches_df["s2v_score"] = m_spec2vec_similarities[
                    all_match_ids, i]
            elif "spec2vec" in include_scores:
                # No Spec2Vec presearch was run, so score the matches one by one.
                spec2vec_similarity = Spec2Vec(
                    model=model,
                    intensity_weighting_power=intensity_weighting_power,
                    allowed_missing_percentage=allowed_missing_percentage)
                matches_df["s2v_score"] = [
                    spec2vec_similarity.pair(documents_library[match_id],
                                             query_document)
                    for match_id in library_ids[all_match_ids]
                ]

            found_matches.append(matches_df.fillna(0))
        else:
            found_matches.append([])

    return found_matches
def main(argv):
    """Command-line entry point: score query spectra against a reference library.

    Loads query (and, unless ``-s`` is given, reference) spectra in 'msp' or
    'mgf' format, builds the requested similarity metric, computes the scores
    and hands them to ``write_outputs``.

    NOTE(review): ``argv`` is accepted but not forwarded; ``parse_args()`` is
    called without arguments, so the parser reads the process command line.

    Returns:
        0 on success, -1 if the similarity metric name is not recognised.

    Raises:
        ValueError: if a query or reference file format is not supported.
    """
    parser = argparse.ArgumentParser(description="Compute MSP similarity scores")
    parser.add_argument("-s", dest="symmetric", action='store_true',
                        help="Computation is symmetric.")
    parser.add_argument("--ref", dest="references_filename", type=str,
                        help="Path to reference spectra library.")
    parser.add_argument("--ref_format", dest="references_format", type=str,
                        help="Reference spectra library file format.")
    parser.add_argument("queries_filename", type=str,
                        help="Path to query spectra.")
    parser.add_argument("queries_format", type=str,
                        help="Query spectra file format.")
    parser.add_argument("similarity_metric", type=str,
                        help='Metric to use for matching.')
    parser.add_argument("tolerance", type=float,
                        help="Tolerance to use for peak matching.")
    parser.add_argument("mz_power", type=float,
                        help="The power to raise mz to in the cosine function.")
    parser.add_argument("intensity_power", type=float,
                        help="The power to raise intensity to in the cosine function.")
    parser.add_argument("output_filename_scores", type=str,
                        help="Path where to store the output .tsv scores.")
    parser.add_argument("output_filename_matches", type=str,
                        help="Path where to store the output .tsv matches.")
    args = parser.parse_args()

    loaders = {'msp': load_from_msp, 'mgf': load_from_mgf}

    if args.queries_format not in loaders:
        raise ValueError(
            f'File format {args.queries_format} not supported for query spectra.'
        )
    queries_spectra = list(loaders[args.queries_format](args.queries_filename))

    if args.symmetric:
        # The queries double as references in the symmetric case.
        reference_spectra = []
    elif args.references_format in loaders:
        reference_spectra = list(
            loaders[args.references_format](args.references_filename))
    else:
        raise ValueError(
            f'File format {args.references_format} not supported for reference spectra library.'
        )

    metric_classes = {
        'CosineGreedy': CosineGreedy,
        'CosineHungarian': CosineHungarian,
        'ModifiedCosine': ModifiedCosine,
    }
    metric_class = metric_classes.get(args.similarity_metric)
    if metric_class is None:
        return -1
    similarity_metric = metric_class(args.tolerance, args.mz_power,
                                     args.intensity_power)
    if args.similarity_metric == 'ModifiedCosine':
        # Only this metric has its spectra passed through convert_precursor_mz.
        reference_spectra = list(map(convert_precursor_mz, reference_spectra))
        queries_spectra = list(map(convert_precursor_mz, queries_spectra))

    print("Calculating scores...")
    references = queries_spectra if args.symmetric else reference_spectra
    scores = calculate_scores(references=references,
                              queries=queries_spectra,
                              similarity_function=similarity_metric,
                              is_symmetric=args.symmetric)

    write_outputs(args, scores)
    return 0
def test_user_workflow():
    """Run a filter -> score -> rank workflow on pesticides.mgf and check the top 10.

    Uses CosineGreedy with default parameters and keeps only non-self
    comparisons with at least 20 matching peaks.
    """

    def apply_my_filters(s):
        # Each filter receives the result of the previous one.
        s = default_filters(s)
        s = add_parent_mass(s)
        s = normalize_intensities(s)
        s = select_by_relative_intensity(s, intensity_from=0.0, intensity_to=1.0)
        s = select_by_mz(s, mz_from=0, mz_to=1000)
        s = require_minimum_number_of_peaks(s, n_required=5)
        return s

    module_root = os.path.join(os.path.dirname(__file__), '..')
    spectrums_file = os.path.join(module_root, 'tests', 'pesticides.mgf')

    # apply the filters and omit spectrums that didn't qualify for analysis
    filtered_spectrums = [apply_my_filters(s) for s in load_from_mgf(spectrums_file)]
    spectrums = [s for s in filtered_spectrums if s is not None]

    # this will be a library grouping analysis, so queries = references = spectrums
    queries = spectrums[:]
    references = spectrums[:]

    similarity = CosineGreedy()
    scores = list(calculate_scores(references, queries, similarity))

    # filter out self-comparisons, require at least 20 matching peaks
    candidates = [(ref, qry, value, n_peaks)
                  for (ref, qry, value, n_peaks) in scores
                  if ref != qry and n_peaks >= 20]
    ranked = sorted(candidates, key=lambda entry: entry[2], reverse=True)
    actual_top10 = ranked[:10]

    # (reference index, query index, expected score, expected matching peaks)
    expected_table = [
        (48, 50, 0.9994510368270997, 25),
        (50, 48, 0.9994510368270997, 25),
        (46, 48, 0.9981252309590571, 27),
        (48, 46, 0.9981252309590571, 27),
        (46, 50, 0.9979632203390496, 22),
        (50, 46, 0.9979632203390496, 22),
        (73, 74, 0.9956795920716246, 23),
        (74, 73, 0.9956795920716246, 23),
        (57, 59, 0.9886557001269415, 46),
        (59, 57, 0.9886557001269415, 46),
    ]
    expected_top10 = [
        (references[ref_idx], queries[qry_idx], pytest.approx(value, rel=1e-9), n_peaks)
        for ref_idx, qry_idx, value, n_peaks in expected_table
    ]
    assert actual_top10 == expected_top10
def main(argv):
    """Command-line entry point: score MSP query spectra against an MSP library.

    Loads the query (and, unless ``-s`` is given, reference) spectra with
    ``load_from_msp``, optionally applies default filters and intensity
    normalization, builds the requested similarity metric, computes the
    scores and hands them to ``write_outputs``.

    NOTE(review): ``argv`` is accepted but not forwarded; ``parse_args()`` is
    called without arguments, so the parser reads the process command line.

    Returns:
        0 on success, -1 if the similarity metric name is not recognised.
    """
    parser = argparse.ArgumentParser(description="Compute MSP similarity scores")
    parser.add_argument("-f", dest="default_filters", action='store_true',
                        help="Apply default filters")
    parser.add_argument("-n", dest="normalize_intensities", action='store_true',
                        help="Normalize intensities.")
    parser.add_argument("-s", dest="symmetric", action='store_true',
                        help="Computation is symmetric.")
    parser.add_argument("--ref", dest="references_filename", type=str,
                        help="Path to reference MSP library.")
    parser.add_argument("queries_filename", type=str,
                        help="Path to query spectra.")
    parser.add_argument("similarity_metric", type=str,
                        help='Metric to use for matching.')
    parser.add_argument("tolerance", type=float,
                        help="Tolerance to use for peak matching.")
    parser.add_argument("mz_power", type=float,
                        help="The power to raise mz to in the cosine function.")
    parser.add_argument("intensity_power", type=float,
                        help="The power to raise intensity to in the cosine function.")
    parser.add_argument("output_filename_scores", type=str,
                        help="Path where to store the output .csv scores.")
    parser.add_argument("output_filename_matches", type=str,
                        help="Path where to store the output .csv matches.")
    args = parser.parse_args()

    queries_spectra = list(load_from_msp(args.queries_filename))
    # The queries double as references in the symmetric case.
    reference_spectra = (
        [] if args.symmetric else list(load_from_msp(args.references_filename)))

    def apply_to_all(transform):
        # Apply one transform to the queries first, then to the references.
        return (list(map(transform, queries_spectra)),
                list(map(transform, reference_spectra)))

    if args.default_filters is True:
        print("Applying default filters...")
        queries_spectra, reference_spectra = apply_to_all(default_filters)

    if args.normalize_intensities is True:
        print("Normalizing intensities...")
        queries_spectra, reference_spectra = apply_to_all(normalize_intensities)

    metric_classes = {
        'CosineGreedy': CosineGreedy,
        'CosineHungarian': CosineHungarian,
        'ModifiedCosine': ModifiedCosine,
    }
    metric_class = metric_classes.get(args.similarity_metric)
    if metric_class is None:
        return -1
    similarity_metric = metric_class(args.tolerance, args.mz_power,
                                     args.intensity_power)
    if args.similarity_metric == 'ModifiedCosine':
        # Only this metric has its spectra passed through add_precursor_mz.
        reference_spectra = list(map(add_precursor_mz, reference_spectra))
        queries_spectra = list(map(add_precursor_mz, queries_spectra))

    print("Calculating scores...")
    references = queries_spectra if args.symmetric else reference_spectra
    scores = calculate_scores(references=references,
                              queries=queries_spectra,
                              similarity_function=similarity_metric,
                              is_symmetric=args.symmetric)

    write_outputs(args, scores)
    return 0
def library_matching(
        documents_query: List[SpectrumDocument],
        documents_library: List[SpectrumDocument],
        model: BaseTopicModel,
        presearch_based_on: List[str] = ["precursor_mz", "spec2vec-top10"],
        ignore_non_annotated: bool = True,
        include_scores=["spec2vec", "cosine", "modcosine"],
        intensity_weighting_power: float = 0.5,
        allowed_missing_percentage: float = 0,
        cosine_tol: float = 0.005,
        min_matches: int = 6,
        mass_tolerance: float = 2.0,
        mass_tolerance_type: str = "ppm"):
    """Select potential spectra matches from a spectra library.

    Candidates are pre-selected per query by any combination of
    1) top-n Spec2Vec similarity,
    2) same precursor mass (within the given tolerance), and
    3) top-n modified cosine score.
    For later matching routines, additional scores (cosine, modified cosine,
    Spec2Vec) are added for every pre-selected candidate.

    Args:
    --------
    documents_query:
        List containing all spectrum documents that should be queried against
        the library.
    documents_library:
        List containing all library spectrum documents.
    model:
        Pretrained word2Vec model.
    presearch_based_on:
        List with strings to specify which measures to use for the presearch.
        This can include 'precursor_mz', 'spec2vec-topX' and 'modcos-topX',
        where X is the number of top candidates to keep per query.
    ignore_non_annotated: bool, optional
        If True, only annotated spectra (those with a non-empty "smiles"
        entry) will be considered for matching. Default = True.
    include_scores:
        Scores to add for the selected candidates. Recognized entries are
        'spec2vec', 'cosine' and 'modcosine'. Cosine-type scores that are not
        requested (and not already computed by the presearch) are filled with
        the placeholder string "not calculated".
    intensity_weighting_power: float, optional
        Passed on to Spec2Vec. Default = 0.5.
    allowed_missing_percentage: float, optional
        Passed on to Spec2Vec. Default = 0.
    cosine_tol: float, optional
        Set tolerance for the cosine and modified cosine score. Default = 0.005
    min_matches: int, optional
        Modified cosine scores with fewer matching peaks than this are set to
        0 before the top-n modified cosine presearch. Default = 6.
    mass_tolerance:
        Specify tolerance for a mass match.
    mass_tolerance_type:
        Choose between "ppm" (relative) and "Dalton" (absolute) tolerance type.

    Returns:
    --------
    List with one entry per query document: a pandas DataFrame indexed by the
    position of each candidate in ``documents_library``, or an empty list if
    no candidate was pre-selected for that query.
    """
    not_calculated = "not calculated"

    def requested_top_n(keyword):
        """Return N from the first presearch entry '<keyword>...topN'."""
        return int(next(x.split("top")[1]
                        for x in presearch_based_on if keyword in x))

    def select_top_n(similarities, top_n):
        """Return, per column, the row indices of the top_n highest values."""
        n_rows, n_cols = np.shape(similarities)
        # Clamp so that asking for more candidates than there are library
        # spectra returns all of them instead of making np.argpartition
        # raise, and so that "top 0" selects nothing (a plain [-0:] slice
        # would select every row).
        n_selected = min(top_n, n_rows)
        if n_selected <= 0:
            return np.empty((0, n_cols), dtype="int")
        return np.argpartition(similarities, -n_selected,
                               axis=0)[-n_selected:, :]

    # Initializations
    # (The list defaults in the signature are only read, never mutated.)
    found_matches = []
    m_mass_matches = None
    m_spec2vec_similarities = None
    m_modcos_similarities = None
    m_modcos_matches = None

    if ignore_non_annotated:
        # Get array of all ids for spectra with smiles. The explicit dtype
        # keeps this an integer index array even when nothing is annotated.
        library_ids = np.asarray(
            [i for i, doc in enumerate(documents_library)
             if doc._obj.get("smiles")],
            dtype="int")
    else:
        library_ids = np.arange(len(documents_library))

    allowed_presearch_type = ["precursor_mz", "spec2vec-top", "modcos-top"]
    msg = "Presearch must include one of: " + ", ".join(allowed_presearch_type)
    assert any(x in y for x in allowed_presearch_type
               for y in presearch_based_on), msg

    n_queries = len(documents_query)
    no_selection = np.empty((0, n_queries), dtype="int")

    # 1. Search for top-n Spec2Vec matches ------------------------------------
    if any("spec2vec" in x for x in presearch_based_on):
        top_n = requested_top_n("spec2vec")
        print(f"Pre-selection includes spec2vec top {top_n}.")
        spec2vec = Spec2Vec(
            model=model,
            intensity_weighting_power=intensity_weighting_power,
            allowed_missing_percentage=allowed_missing_percentage,
            progress_bar=True)
        m_spec2vec_similarities = spec2vec.matrix(
            [documents_library[i] for i in library_ids], documents_query)

        # Select top_n similarity values:
        selection_spec2vec = select_top_n(m_spec2vec_similarities, top_n)
    else:
        selection_spec2vec = no_selection

    # 2. Search for precursor_mz based matches ---------------------------------
    if "precursor_mz" in presearch_based_on:
        print(
            f"Pre-selection includes mass matches within {mass_tolerance} {mass_tolerance_type}."
        )
        mass_matching = PrecursorMzMatch(tolerance=mass_tolerance,
                                         tolerance_type=mass_tolerance_type)
        m_mass_matches = mass_matching.matrix(
            [documents_library[i]._obj for i in library_ids],
            [x._obj for x in documents_query])
        selection_massmatch = [
            np.where(m_mass_matches[:, i] == 1)[0] for i in range(n_queries)
        ]
    else:
        selection_massmatch = np.empty((n_queries, 0), dtype="int")

    # 3. Search for top-n modified cosine matches ------------------------------
    if any("modcos" in x for x in presearch_based_on):
        top_n = requested_top_n("modcos")
        print(f"Pre-selection includes modified cosine top {top_n}.")
        modcos = ModifiedCosine(tolerance=cosine_tol)

        n_rows = len(library_ids)
        m_modcos_similarities = np.zeros([n_rows, n_queries],
                                         dtype=np.float64)
        m_modcos_matches = np.zeros([n_rows, n_queries], dtype=np.float64)
        query_spectra = [x._obj for x in documents_query]
        for i_ref, reference in enumerate(
                tqdm([documents_library[i]._obj for i in library_ids])):
            for i_query, query in enumerate(query_spectra):
                score = modcos.pair(reference, query)
                # Read the result by field name, like every other use of
                # ``pair`` results in this function.
                m_modcos_similarities[i_ref][i_query] = score["score"]
                m_modcos_matches[i_ref][i_query] = score["matches"]

        # Select top_n similarity values, ignoring poorly supported scores:
        m_modcos_selected = m_modcos_similarities.copy()
        m_modcos_selected[m_modcos_matches < min_matches] = 0
        selection_modcos = select_top_n(m_modcos_selected, top_n)
    else:
        selection_modcos = no_selection

    # 4. Combine found matches ------------------------------------------------
    cosine_scorer = None
    if "cosine" in include_scores:
        print("Calculate cosine score for selected candidates.")
        cosine_scorer = CosineGreedy(tolerance=cosine_tol)
    if "modcosine" in include_scores:
        print("Calculate modified cosine score for selected candidates.")

    # Scorers are only needed when the presearch has not already produced the
    # corresponding matrix; build them once instead of once per query.
    modcos_scorer = None
    if m_modcos_similarities is None and "modcosine" in include_scores:
        modcos_scorer = ModifiedCosine(tolerance=cosine_tol)
    spec2vec_scorer = None
    if m_spec2vec_similarities is None and "spec2vec" in include_scores:
        spec2vec_scorer = Spec2Vec(
            model=model,
            intensity_weighting_power=intensity_weighting_power,
            allowed_missing_percentage=allowed_missing_percentage)

    for i in tqdm(range(n_queries)):
        query_document = documents_query[i]
        all_match_ids = np.unique(
            np.concatenate((selection_spec2vec[:, i],
                            selection_massmatch[i],
                            selection_modcos[:, i])))

        if len(all_match_ids) == 0:
            found_matches.append([])
            continue

        # all_match_ids index into the (possibly annotated-only) subset;
        # translate them back to positions in documents_library.
        matched_library_ids = library_ids[all_match_ids]
        n_candidates = len(all_match_ids)

        # Cosine columns
        if cosine_scorer is not None:
            cosine_results = [
                cosine_scorer.pair(documents_library[match_id]._obj,
                                   query_document._obj)
                for match_id in matched_library_ids
            ]
            cosine_score_column = [x["score"] for x in cosine_results]
            cosine_matches_column = [x["matches"] for x in cosine_results]
        else:
            cosine_score_column = n_candidates * [not_calculated]
            cosine_matches_column = n_candidates * [not_calculated]

        # Modified cosine columns
        if m_modcos_similarities is not None:
            # Already computed for the presearch; reuse the matrices.
            mod_cosine_score_column = m_modcos_similarities[all_match_ids, i]
            mod_cosine_matches_column = m_modcos_matches[all_match_ids, i]
        elif modcos_scorer is not None:
            mod_cosine_results = [
                modcos_scorer.pair(documents_library[match_id]._obj,
                                   query_document._obj)
                for match_id in matched_library_ids
            ]
            mod_cosine_score_column = [
                x["score"] for x in mod_cosine_results
            ]
            mod_cosine_matches_column = [
                x["matches"] for x in mod_cosine_results
            ]
        else:
            mod_cosine_score_column = n_candidates * [not_calculated]
            mod_cosine_matches_column = n_candidates * [not_calculated]

        matches_df = pd.DataFrame(
            {
                "cosine_score": cosine_score_column,
                "cosine_matches": cosine_matches_column,
                "mod_cosine_score": mod_cosine_score_column,
                "mod_cosine_matches": mod_cosine_matches_column
            },
            index=matched_library_ids)

        if m_mass_matches is not None:
            matches_df["mass_match"] = m_mass_matches[all_match_ids, i]

        if m_spec2vec_similarities is not None:
            matches_df["s2v_score"] = m_spec2vec_similarities[all_match_ids,
                                                              i]
        elif spec2vec_scorer is not None:
            matches_df["s2v_score"] = [
                spec2vec_scorer.pair(documents_library[match_id],
                                     query_document)
                for match_id in matched_library_ids
            ]

        found_matches.append(matches_df.fillna(0))

    return found_matches
def main(argv):
    """Command-line entry point: score query spectra against a reference library.

    Reference and query spectra are loaded from MSP files, scored with the
    requested similarity metric, and two ';'-separated CSV tables are written
    (rows = reference names, columns = query names): one with the scores and
    one with the number of matches.

    The command line is parsed with ``parser.parse_args()`` (no arguments), so
    argparse reads ``sys.argv`` itself and ``argv`` is not consulted.

    Returns:
        0 on success, -1 if ``similarity_metric`` is not one of
        'CosineGreedy', 'CosineHungarian' or 'ModifiedCosine'.
    """
    parser = argparse.ArgumentParser(
        description="Compute MSP similarity scores")

    # Positional arguments; their order defines the command-line layout.
    positionals = (
        ("references_filename", str, "Path to reference MSP library."),
        ("queries_filename", str, "Path to query spectra."),
        ("similarity_metric", str, 'Metric to use for matching.'),
        ("output_filename_scores", str,
         "Path where to store the output .csv scores."),
        ("output_filename_matches", str,
         "Path where to store the output .csv matches."),
        ("tolerance", float, "Tolerance to use for peak matching."),
        ("mz_power", float,
         "The power to raise mz to in the cosine function."),
        ("intensity_power", float,
         "The power to raise intensity to in the cosine function."),
    )
    for name, value_type, description in positionals:
        parser.add_argument(name, type=value_type, help=description)

    options = parser.parse_args()

    references = load_from_msp(options.references_filename)
    queries = load_from_msp(options.queries_filename)

    metric_classes = {
        'CosineGreedy': CosineGreedy,
        'CosineHungarian': CosineHungarian,
        'ModifiedCosine': ModifiedCosine,
    }
    metric_class = metric_classes.get(options.similarity_metric)
    if metric_class is None:
        return -1

    similarity_metric = metric_class(options.tolerance, options.mz_power,
                                     options.intensity_power)
    if options.similarity_metric == 'ModifiedCosine':
        # Only the modified cosine gets the extra add_precursor_mz pass;
        # it stays lazy and runs when the spectra are materialized below.
        references = (add_precursor_mz(spectrum) for spectrum in references)
        queries = (add_precursor_mz(spectrum) for spectrum in queries)

    scores = calculate_scores(
        references=list(references),
        queries=list(queries),
        similarity_function=similarity_metric,
    )

    column_labels = [spectrum.metadata['name'] for spectrum in scores.queries]
    row_labels = [spectrum.metadata['name'] for spectrum in scores.references]

    # One table per field of the score entries: first the scores, then the
    # number of matches.
    outputs = (
        ("score", options.output_filename_scores),
        ("matches", options.output_filename_matches),
    )
    for field, destination in outputs:
        table = DataFrame(
            data=[entry[field] for entry in scores.scores],
            index=row_labels,
            columns=column_labels)
        table.to_csv(destination, sep=';')

    return 0