def test_modified_cosine_order_of_input_spectrums():
    """Check that swapping reference and query does not change the result."""
    shared_intensities = [700, 200, 100, 1000, 200, 5, 500]
    raw_spectra = [
        Spectrum(
            mz=numpy.array([100, 150, 200, 300, 500, 510, 1100], dtype="float"),
            intensities=numpy.array(shared_intensities, dtype="float"),
            metadata={"precursor_mz": 1000.0}),
        Spectrum(
            mz=numpy.array([55, 105, 205, 304.5, 494.5, 515.5, 1045], dtype="float"),
            intensities=numpy.array(shared_intensities, dtype="float"),
            metadata={"precursor_mz": 1005.0}),
    ]
    first, second = [normalize_intensities(raw) for raw in raw_spectra]

    similarity = ModifiedCosine(tolerance=2.0)
    # Score the pair in both directions.
    score_forward, matches_forward = similarity.pair(first, second)
    score_backward, matches_backward = similarity.pair(second, first)

    order_message = "Expected that the order of the arguments would not matter."
    assert score_forward == score_backward, order_message
    assert matches_forward == matches_backward, order_message
def test_modified_cosine_with_mass_shifted_and_unshifted_matches():
    """Modified cosine when shifted and unshifted peak pairs compete.

    Five peak pairs are possible for these spectra, but every peak may be
    counted only once, so only three pairs should contribute to the score.
    """
    reference = Spectrum(
        mz=numpy.array([100, 110, 200, 300, 400, 500, 600], dtype="float"),
        intensities=numpy.array([100, 50, 1, 80, 1, 1, 50], dtype="float"),
        metadata={"precursor_mz": 1000.0})
    query = Spectrum(
        mz=numpy.array([110, 200, 300, 310, 700, 800], dtype="float"),
        intensities=numpy.array([100, 1, 90, 90, 1, 100], dtype="float"),
        metadata={"precursor_mz": 1010.0})

    result = ModifiedCosine().pair(reference, query)

    ref_intensities = reference.peaks.intensities
    query_intensities = query.peaks.intensities
    # (reference index, query index) of the three pairs expected to be used.
    expected_pairs = [(0, 0), (3, 3), (2, 1)]
    numerator = sum(ref_intensities[i] * query_intensities[j]
                    for i, j in expected_pairs)
    denominator = numpy.sqrt(
        numpy.sum(ref_intensities**2) * numpy.sum(query_intensities**2))
    expected_score = numerator / denominator

    assert result["score"] == pytest.approx(
        expected_score, 0.00001), "Expected different cosine score."
    assert result["matches"] == 3, "Expected 3 matching peaks."
def test_modified_cosine_with_mass_shift(peaks, tolerance, masses, expected_matches):
    """Score two builder-made spectra and compare with the expected matches.

    ``peaks[k][0]`` / ``peaks[k][1]`` hold the m/z values and intensities of
    spectrum ``k`` and ``masses[k]`` its precursor m/z. A ``tolerance`` of
    ``None`` means the ModifiedCosine default tolerance is used.
    """
    builder = SpectrumBuilder()
    # Both spectra are built with the same builder instance before either is
    # normalized.
    built = [
        builder.with_mz(peaks[index][0])
        .with_intensities(peaks[index][1])
        .with_metadata(metadata={"precursor_mz": masses[index]})
        .build()
        for index in (0, 1)
    ]
    norm_first, norm_second = [normalize_intensities(item) for item in built]

    similarity = (ModifiedCosine() if tolerance is None
                  else ModifiedCosine(tolerance=tolerance))
    result = similarity.pair(norm_first, norm_second)
    expected_score = compute_expected_score(norm_first, norm_second,
                                            expected_matches)

    assert result["score"] == pytest.approx(
        expected_score, 0.0001), "Expected different cosine score."
    assert result["matches"] == len(
        expected_matches), "Expected differnt number of matching peaks."
def create_dummy_scores_symmetric_modified_cosine():
    """Return modified cosine scores of the dummy spectra against themselves."""
    dummy_spectrums = create_dummy_spectrums()
    # The same list is passed as references and as queries.
    return calculate_scores(dummy_spectrums, dummy_spectrums, ModifiedCosine())
def test_modified_cosine_with_mass_shift_5_no_matches_expected():
    """Modified cosine for two spectra that should share no matching peaks."""
    shared_intensities = [10, 10, 500]
    raw_spectra = [
        Spectrum(mz=numpy.array([100, 200, 300], dtype="float"),
                 intensities=numpy.array(shared_intensities, dtype="float"),
                 metadata={"precursor_mz": 1000.0}),
        Spectrum(mz=numpy.array([120, 220, 320], dtype="float"),
                 intensities=numpy.array(shared_intensities, dtype="float"),
                 metadata={"precursor_mz": 1005}),
    ]
    first, second = [normalize_intensities(raw) for raw in raw_spectra]

    result = ModifiedCosine(tolerance=1.0).pair(first, second)

    assert result["score"] == pytest.approx(
        0.0, 1e-5), "Expected different modified cosine score."
    assert result["matches"] == 0, "Expected 0 matching peaks."
def test_modified_cosine_without_precursor_mz():
    """Spectra lacking a precursor m/z must trigger an AssertionError."""
    shared_intensities = [700, 200, 100, 1000, 200, 5, 500]
    # No metadata is passed, so neither spectrum carries a precursor_mz.
    raw_spectra = [
        Spectrum(
            mz=numpy.array([100, 150, 200, 300, 500, 510, 1100], dtype="float"),
            intensities=numpy.array(shared_intensities, dtype="float")),
        Spectrum(
            mz=numpy.array([100, 140, 190, 300, 490, 510, 1090], dtype="float"),
            intensities=numpy.array(shared_intensities, dtype="float")),
    ]
    first, second = [normalize_intensities(raw) for raw in raw_spectra]
    similarity = ModifiedCosine()

    with pytest.raises(AssertionError) as raised:
        similarity.pair(first, second)

    assert str(raised.value) == (
        "Precursor_mz missing. Apply 'add_precursor_mz' filter first.")
def test_modified_cosine_with_mass_shift_5_tolerance_2():
    """Modified cosine for a precursor shift of 5 with a tolerance of 2.0."""
    raw_spectra = [
        Spectrum(
            mz=numpy.array([100, 200, 299, 300, 301, 500, 510], dtype="float"),
            intensities=numpy.array([10, 10, 500, 100, 200, 20, 100],
                                    dtype="float"),
            metadata={"precursor_mz": 1000.0}),
        Spectrum(
            mz=numpy.array([105, 205, 305, 306, 505, 517], dtype="float"),
            intensities=numpy.array([10, 10, 500, 100, 20, 100], dtype="float"),
            metadata={"precursor_mz": 1005}),
    ]
    first, second = [normalize_intensities(raw) for raw in raw_spectra]

    result = ModifiedCosine(tolerance=2.0).pair(first, second)

    assert result["score"] == pytest.approx(
        0.96788, 0.0001), "Expected different modified cosine score."
    assert result["matches"] == 6, "Expected 6 matching peaks."
def test_modified_cosine_with_mass_shift_5():
    """Modified cosine for a precursor shift of 5 with the default tolerance."""
    shared_intensities = [700, 200, 100, 1000, 200, 5, 500]
    raw_spectra = [
        Spectrum(
            mz=numpy.array([100, 150, 200, 300, 500, 510, 1100], dtype="float"),
            intensities=numpy.array(shared_intensities, dtype="float"),
            metadata={"precursor_mz": 1000.0}),
        Spectrum(
            mz=numpy.array([55, 105, 205, 304.5, 494.5, 515.5, 1045],
                           dtype="float"),
            intensities=numpy.array(shared_intensities, dtype="float"),
            metadata={"precursor_mz": 1005.0}),
    ]
    first, second = [normalize_intensities(raw) for raw in raw_spectra]

    result = ModifiedCosine().pair(first, second)

    assert result["score"] == pytest.approx(
        0.081966, 0.0001), "Expected different cosine score."
    assert result["matches"] == 2, "Expected 2 matching peaks."
def test_modified_cosine_precursor_mz_as_invalid_string():
    """A precursor_mz string that is not a plain number must raise."""
    shared_intensities = [10, 10, 500]
    raw_spectra = [
        Spectrum(mz=np.array([100, 200, 300], dtype="float"),
                 intensities=np.array(shared_intensities, dtype="float"),
                 metadata={"precursor_mz": 1000.0}),
        Spectrum(mz=np.array([120, 220, 320], dtype="float"),
                 intensities=np.array(shared_intensities, dtype="float"),
                 metadata={"precursor_mz": "mz 1005.0"}),
    ]
    first, second = [normalize_intensities(raw) for raw in raw_spectra]
    similarity = ModifiedCosine(tolerance=1.0)

    with pytest.raises(AssertionError) as raised:
        _ = similarity.pair(first, second)

    assert str(raised.value) == (
        "Precursor_mz missing. Apply 'add_precursor_mz' filter first.")
def test_modified_cosine_precursor_mz_as_string(caplog):
    """A numeric precursor_mz given as string is scored and logged.

    Metadata harmonization is switched off for both spectra, so the string
    value reaches the similarity measure unchanged.
    """
    shared_intensities = [10, 10, 500]
    raw_spectra = [
        Spectrum(mz=np.array([100, 200, 300], dtype="float"),
                 intensities=np.array(shared_intensities, dtype="float"),
                 metadata={"precursor_mz": 1000.0},
                 metadata_harmonization=False),
        Spectrum(mz=np.array([120, 220, 320], dtype="float"),
                 intensities=np.array(shared_intensities, dtype="float"),
                 metadata={"precursor_mz": "1005.0"},
                 metadata_harmonization=False),
    ]
    first, second = [normalize_intensities(raw) for raw in raw_spectra]

    result = ModifiedCosine(tolerance=1.0).pair(first, second)

    assert result["score"] == pytest.approx(
        0.0, 1e-5), "Expected different modified cosine score."
    assert result["matches"] == 0, "Expected 0 matching peaks."
    # The string precursor_mz is expected to show up in the captured log.
    log_fragment = ("Precursor_mz must be of type int or float. "
                    "Apply 'add_precursor_mz' filter first.")
    assert log_fragment in caplog.text, "Expected different log message"
def _unpack_similarity_result(result):
    """Return ``(score, matches)`` for a single similarity result.

    Accepts results exposing named ``"score"``/``"matches"`` fields, plain
    ``(score, matches)`` sequences, and string placeholders such as
    ``"not calculated"`` (returned unchanged for both values).
    """
    if isinstance(result, str):
        return result, result
    try:
        return result["score"], result["matches"]
    except (TypeError, IndexError, KeyError, ValueError):
        # No named fields available: fall back to positional access.
        return result[0], result[1]


def _split_similarity_results(results):
    """Split similarity results into parallel score and match-count lists."""
    unpacked = [_unpack_similarity_result(result) for result in results]
    return [pair[0] for pair in unpacked], [pair[1] for pair in unpacked]


def library_matching(
        documents_query: List[SpectrumDocument],
        documents_library: List[SpectrumDocument],
        model: BaseTopicModel,
        presearch_based_on: List[str] = ["precursor_mz", "spec2vec-top10"],
        ignore_non_annotated: bool = True,
        include_scores=["spec2vec", "cosine", "modcosine"],
        intensity_weighting_power: float = 0.5,
        allowed_missing_percentage: float = 0,
        cosine_tol: float = 0.005,
        min_matches: int = 6,
        mass_tolerance: float = 2.0,
        mass_tolerance_type: str = "ppm"):
    """Selecting potential spectra matches with spectra library.

    Candidates are pre-selected by any combination of 1) top_n Spec2Vec
    similarity, 2) matching precursor m/z (within the given tolerance) and
    3) top_n modified cosine score. For later matching routines, additional
    scores (cosine, modified cosine, Spec2Vec) are added for the candidates.

    Args:
    --------
    documents_query:
        List containing all spectrum documents that should be queried against
        the library.
    documents_library:
        List containing all library spectrum documents.
    model:
        Pretrained word2Vec model.
    presearch_based_on:
        List with strings to specify which measures to use for the presearch.
        This can include 'precursor_mz', 'spec2vec-topX' and 'modcos-topX',
        where X is the number of top candidates to keep. The default list is
        only read, never modified.
    ignore_non_annotated: bool, optional
        If True, only annotated spectra (with a non-empty "smiles" entry) will
        be considered for matching. Default = True.
    include_scores:
        Scores to report for the selected candidates. 'cosine', 'modcosine'
        and 'spec2vec' are recognized. A cosine or modified cosine score that
        is not computed is reported as "not calculated" in its columns.
    intensity_weighting_power: float, optional
        Forwarded to Spec2Vec. Default = 0.5.
    allowed_missing_percentage: float, optional
        Forwarded to Spec2Vec. Default = 0.
    cosine_tol: float, optional
        Set tolerance for the cosine and modified cosine score. Default = 0.005
    min_matches: int, optional
        Modified cosine scores with fewer matching peaks than this are set to
        zero before the 'modcos-topX' selection. Default = 6.
    mass_tolerance
        Specify tolerance for a mass match.
    mass_tolerance_type
        Chose between "ppm" (relative) and "Dalton" (absolute) tolerance type.

    Returns:
    --------
    List with one entry per query document: a pandas DataFrame (indexed by
    the position of the candidate in documents_library) or an empty list if no
    candidate was selected for that query.

    Raises:
    --------
    AssertionError
        If presearch_based_on contains none of the supported presearch types.
    """
    # Initializations
    found_matches = []
    m_mass_matches = None
    m_spec2vec_similarities = None
    m_modcos_similarities = None
    m_modcos_matches = None

    library_spectra_metadata = [
        doc._obj.get("smiles") for doc in documents_library
    ]
    if ignore_non_annotated:
        # Get array of all ids for spectra with smiles. The explicit dtype
        # keeps the array usable as an index even when it is empty.
        library_ids = np.asarray(
            [i for i, x in enumerate(library_spectra_metadata) if x],
            dtype=int)
    else:
        library_ids = np.arange(len(documents_library))

    allowed_presearch_type = ["precursor_mz", "spec2vec-top", "modcos-top"]
    msg = "Presearch must include one of: " + ", ".join(allowed_presearch_type)
    assert np.any([(x in y) for x in allowed_presearch_type
                   for y in presearch_based_on]), msg

    # 1. Search for top-n Spec2Vec matches ------------------------------------
    if np.any(["spec2vec" in x for x in presearch_based_on]):
        top_n = int([
            x.split("top")[1] for x in presearch_based_on if "spec2vec" in x
        ][0])
        print(f"Pre-selection includes spec2vec top {top_n}.")
        spec2vec = Spec2Vec(
            model=model,
            intensity_weighting_power=intensity_weighting_power,
            allowed_missing_percentage=allowed_missing_percentage,
            progress_bar=True)
        m_spec2vec_similarities = spec2vec.matrix(
            [documents_library[i] for i in library_ids], documents_query)

        # Select top_n similarity values:
        selection_spec2vec = np.argpartition(m_spec2vec_similarities,
                                             -top_n,
                                             axis=0)[-top_n:, :]
    else:
        selection_spec2vec = np.empty((0, len(documents_query)), dtype="int")

    # 2. Search for precursor_mz based matches ---------------------------------
    if "precursor_mz" in presearch_based_on:
        print(
            f"Pre-selection includes mass matches within {mass_tolerance} {mass_tolerance_type}."
        )
        mass_matching = PrecursorMzMatch(tolerance=mass_tolerance,
                                         tolerance_type=mass_tolerance_type)
        m_mass_matches = mass_matching.matrix(
            [documents_library[i]._obj for i in library_ids],
            [x._obj for x in documents_query])
        selection_massmatch = [
            np.where(m_mass_matches[:, i] == 1)[0]
            for i in range(len(documents_query))
        ]
    else:
        selection_massmatch = np.empty((len(documents_query), 0), dtype="int")

    # 3. Search for top-n modified cosine matches ------------------------------
    if np.any(["modcos" in x for x in presearch_based_on]):
        top_n = int([
            x.split("top")[1] for x in presearch_based_on if "modcos" in x
        ][0])
        print(f"Pre-selection includes modified cosine top {top_n}.")
        modcos = ModifiedCosine(tolerance=cosine_tol)

        n_rows = len(library_ids)
        n_cols = len(documents_query)
        m_modcos_similarities = np.zeros([n_rows, n_cols], dtype=np.float64)
        m_modcos_matches = np.zeros([n_rows, n_cols], dtype=np.float64)
        query_spectra = [x._obj for x in documents_query]
        for i_ref, reference in enumerate(
                tqdm([documents_library[i]._obj for i in library_ids])):
            for i_query, query in enumerate(query_spectra):
                # Read the result the same way as in step 4 (named fields,
                # with a positional fallback) instead of score[0]/score[1].
                score, n_matches = _unpack_similarity_result(
                    modcos.pair(reference, query))
                m_modcos_similarities[i_ref, i_query] = score
                m_modcos_matches[i_ref, i_query] = n_matches

        # Select top_n similarity values:
        m_modcos_selected = m_modcos_similarities.copy()
        m_modcos_selected[m_modcos_matches < min_matches] = 0
        selection_modcos = np.argpartition(m_modcos_selected, -top_n,
                                           axis=0)[-top_n:, :]
    else:
        selection_modcos = np.empty((0, len(documents_query)), dtype="int")

    # 4. Combine found matches ------------------------------------------------
    cosine_similarity = None
    if "cosine" in include_scores:
        print("Calculate cosine score for selected candidates.")
        cosine_similarity = CosineGreedy(tolerance=cosine_tol)
    mod_cosine_similarity = None
    if "modcosine" in include_scores:
        print("Calculate modified cosine score for selected candidates.")
        if m_modcos_similarities is None:
            # Only needed when the presearch did not already compute it.
            mod_cosine_similarity = ModifiedCosine(tolerance=cosine_tol)
    spec2vec_similarity = None

    for i in tqdm(range(len(documents_query))):
        s2v_top_ids = selection_spec2vec[:, i]
        mass_match_ids = selection_massmatch[i]
        modcos_ids = selection_modcos[:, i]

        all_match_ids = np.unique(
            np.concatenate((s2v_top_ids, mass_match_ids, modcos_ids)))

        if len(all_match_ids) == 0:
            found_matches.append([])
            continue

        matched_library_ids = library_ids[all_match_ids]

        if cosine_similarity is not None:
            # Get cosine score for found matches
            cosine_results = [
                cosine_similarity.pair(documents_library[match_id]._obj,
                                       documents_query[i]._obj)
                for match_id in matched_library_ids
            ]
        else:
            cosine_results = len(all_match_ids) * ["not calculated"]
        cosine_scores, cosine_matches = _split_similarity_results(
            cosine_results)

        if m_modcos_similarities is not None:
            # Re-use the scores computed during the presearch.
            mod_cosine_scores = list(m_modcos_similarities[all_match_ids, i])
            mod_cosine_matches = list(m_modcos_matches[all_match_ids, i])
        else:
            if mod_cosine_similarity is not None:
                # Get modified cosine score for found matches
                mod_cosine_results = [
                    mod_cosine_similarity.pair(
                        documents_library[match_id]._obj,
                        documents_query[i]._obj)
                    for match_id in matched_library_ids
                ]
            else:
                mod_cosine_results = len(all_match_ids) * ["not calculated"]
            mod_cosine_scores, mod_cosine_matches = _split_similarity_results(
                mod_cosine_results)

        matches_df = pd.DataFrame(
            {
                "cosine_score": cosine_scores,
                "cosine_matches": cosine_matches,
                "mod_cosine_score": mod_cosine_scores,
                "mod_cosine_matches": mod_cosine_matches
            },
            index=matched_library_ids)

        if m_mass_matches is not None:
            matches_df["mass_match"] = m_mass_matches[all_match_ids, i]

        if m_spec2vec_similarities is not None:
            matches_df["s2v_score"] = m_spec2vec_similarities[all_match_ids,
                                                              i]
        elif "spec2vec" in include_scores:
            if spec2vec_similarity is None:
                spec2vec_similarity = Spec2Vec(
                    model=model,
                    intensity_weighting_power=intensity_weighting_power,
                    allowed_missing_percentage=allowed_missing_percentage)
            matches_df["s2v_score"] = [
                spec2vec_similarity.pair(documents_library[match_id],
                                         documents_query[i])
                for match_id in matched_library_ids
            ]
        found_matches.append(matches_df.fillna(0))

    return found_matches
print("normalising intensities") # Apply filters to clean and enhance each spectrum spectrums = [] for spectrum in file: spectrum = default_filters(spectrum) # Scale peak intensities to maximum of 1 spectrum = normalize_intensities(spectrum) print(spectrum.get('precursor_mz')) spectrums.append(spectrum) scores = calculate_scores( references=spectrums, queries=spectrums, similarity_function=ModifiedCosine(tolerance=args.fragment_tolerance)) spectra_matches = convert.convert_scores(scores) spectra_list = [] for s in spectrums: new = convert.convert_spectrum(s) spectra_list.append(new) else: from msmolnet import read_mgf as mgf input_mgf = f'{args.input}.mgf' print(f"reading file {input_mgf}") spectra_list = mgf.read_mgf(input_mgf) if (args.library):
def main(argv):
    """Command-line entry point computing MSP similarity scores.

    Loads the query spectra (and the reference library unless ``-s`` is set),
    optionally applies the default filters and intensity normalization,
    computes the requested similarity and hands the scores to
    ``write_outputs``.

    NOTE(review): ``argv`` is accepted but not used; ``parse_args()`` is
    called without arguments and therefore reads ``sys.argv`` - confirm this
    is intended.

    Returns:
        0 on success, -1 if the similarity metric name is not recognized.
    """
    parser = argparse.ArgumentParser(
        description="Compute MSP similarity scores")

    switches = (
        ("-f", "default_filters", "Apply default filters"),
        ("-n", "normalize_intensities", "Normalize intensities."),
        ("-s", "symmetric", "Computation is symmetric."),
    )
    for flag, destination, help_text in switches:
        parser.add_argument(flag, dest=destination, action='store_true',
                            help=help_text)
    parser.add_argument("--ref", dest="references_filename", type=str,
                        help="Path to reference MSP library.")

    positionals = (
        ("queries_filename", str, "Path to query spectra."),
        ("similarity_metric", str, 'Metric to use for matching.'),
        ("tolerance", float, "Tolerance to use for peak matching."),
        ("mz_power", float,
         "The power to raise mz to in the cosine function."),
        ("intensity_power", float,
         "The power to raise intensity to in the cosine function."),
        ("output_filename_scores", str,
         "Path where to store the output .csv scores."),
        ("output_filename_matches", str,
         "Path where to store the output .csv matches."),
    )
    for name, value_type, help_text in positionals:
        parser.add_argument(name, type=value_type, help=help_text)

    args = parser.parse_args()

    queries_spectra = list(load_from_msp(args.queries_filename))
    # In symmetric mode the queries double as references further down.
    reference_spectra = ([] if args.symmetric else list(
        load_from_msp(args.references_filename)))

    if args.default_filters:
        print("Applying default filters...")
        queries_spectra = [default_filters(s) for s in queries_spectra]
        reference_spectra = [default_filters(s) for s in reference_spectra]

    if args.normalize_intensities:
        print("Normalizing intensities...")
        queries_spectra = [normalize_intensities(s) for s in queries_spectra]
        reference_spectra = [
            normalize_intensities(s) for s in reference_spectra
        ]

    metric_classes = {
        'CosineGreedy': CosineGreedy,
        'CosineHungarian': CosineHungarian,
        'ModifiedCosine': ModifiedCosine,
    }
    metric_class = metric_classes.get(args.similarity_metric)
    if metric_class is None:
        return -1
    similarity_metric = metric_class(args.tolerance, args.mz_power,
                                     args.intensity_power)
    if args.similarity_metric == 'ModifiedCosine':
        # The precursor m/z filter is applied only for this metric.
        reference_spectra = [add_precursor_mz(s) for s in reference_spectra]
        queries_spectra = [add_precursor_mz(s) for s in queries_spectra]

    print("Calculating scores...")
    references = queries_spectra if args.symmetric else reference_spectra
    scores = calculate_scores(references=references,
                              queries=queries_spectra,
                              similarity_function=similarity_metric,
                              is_symmetric=args.symmetric)

    write_outputs(args, scores)
    return 0
def main(argv):
    """Command-line entry point: score MSP spectra and write two CSV files.

    Loads reference and query spectra, computes the requested similarity and
    writes one ';'-separated table with the scores and one with the number of
    matches (rows: reference names, columns: query names).

    NOTE(review): ``argv`` is accepted but not used; ``parse_args()`` is
    called without arguments and therefore reads ``sys.argv`` - confirm this
    is intended.

    Returns:
        0 on success, -1 if the similarity metric name is not recognized.
    """
    parser = argparse.ArgumentParser(
        description="Compute MSP similarity scores")
    positionals = (
        ("references_filename", str, "Path to reference MSP library."),
        ("queries_filename", str, "Path to query spectra."),
        ("similarity_metric", str, 'Metric to use for matching.'),
        ("output_filename_scores", str,
         "Path where to store the output .csv scores."),
        ("output_filename_matches", str,
         "Path where to store the output .csv matches."),
        ("tolerance", float, "Tolerance to use for peak matching."),
        ("mz_power", float,
         "The power to raise mz to in the cosine function."),
        ("intensity_power", float,
         "The power to raise intensity to in the cosine function."),
    )
    for name, value_type, help_text in positionals:
        parser.add_argument(name, type=value_type, help=help_text)
    args = parser.parse_args()

    # Spectra stay lazy iterables until calculate_scores materializes them.
    reference_spectra = load_from_msp(args.references_filename)
    queries_spectra = load_from_msp(args.queries_filename)

    metric_classes = {
        'CosineGreedy': CosineGreedy,
        'CosineHungarian': CosineHungarian,
        'ModifiedCosine': ModifiedCosine,
    }
    metric_class = metric_classes.get(args.similarity_metric)
    if metric_class is None:
        return -1
    similarity_metric = metric_class(args.tolerance, args.mz_power,
                                     args.intensity_power)
    if args.similarity_metric == 'ModifiedCosine':
        # The precursor m/z filter is applied only for this metric.
        reference_spectra = (add_precursor_mz(s) for s in reference_spectra)
        queries_spectra = (add_precursor_mz(s) for s in queries_spectra)

    scores = calculate_scores(
        references=list(reference_spectra),
        queries=list(queries_spectra),
        similarity_function=similarity_metric,
    )

    query_names = [spectra.metadata['name'] for spectra in scores.queries]
    reference_names = [
        spectra.metadata['name'] for spectra in scores.references
    ]

    # One table per result field: first the scores, then the match counts.
    outputs = (
        ("score", args.output_filename_scores),
        ("matches", args.output_filename_matches),
    )
    for field, destination in outputs:
        table = DataFrame(data=[entry[field] for entry in scores.scores],
                          index=reference_names,
                          columns=query_names)
        table.to_csv(destination, sep=';')
    return 0
def main(argv):
    """Command-line entry point scoring spectra read from MSP or MGF files.

    Loads the query spectra (and the reference library unless ``-s`` is set)
    in the declared file format, computes the requested similarity and hands
    the scores to ``write_outputs``.

    NOTE(review): ``argv`` is accepted but not used; ``parse_args()`` is
    called without arguments and therefore reads ``sys.argv`` - confirm this
    is intended.

    Returns:
        0 on success, -1 if the similarity metric name is not recognized.

    Raises:
        ValueError: If a file format other than 'msp' or 'mgf' is given for
            the queries, or for the references in non-symmetric mode.
    """
    parser = argparse.ArgumentParser(
        description="Compute MSP similarity scores")
    parser.add_argument("-s", dest="symmetric", action='store_true',
                        help="Computation is symmetric.")
    parser.add_argument("--ref", dest="references_filename", type=str,
                        help="Path to reference spectra library.")
    parser.add_argument("--ref_format", dest="references_format", type=str,
                        help="Reference spectra library file format.")
    positionals = (
        ("queries_filename", str, "Path to query spectra."),
        ("queries_format", str, "Query spectra file format."),
        ("similarity_metric", str, 'Metric to use for matching.'),
        ("tolerance", float, "Tolerance to use for peak matching."),
        ("mz_power", float,
         "The power to raise mz to in the cosine function."),
        ("intensity_power", float,
         "The power to raise intensity to in the cosine function."),
        ("output_filename_scores", str,
         "Path where to store the output .tsv scores."),
        ("output_filename_matches", str,
         "Path where to store the output .tsv matches."),
    )
    for name, value_type, help_text in positionals:
        parser.add_argument(name, type=value_type, help=help_text)
    args = parser.parse_args()

    loaders = {'msp': load_from_msp, 'mgf': load_from_mgf}

    query_loader = loaders.get(args.queries_format)
    if query_loader is None:
        raise ValueError(
            f'File format {args.queries_format} not supported for query spectra.'
        )
    queries_spectra = list(query_loader(args.queries_filename))

    if args.symmetric:
        # The queries double as references further down.
        reference_spectra = []
    else:
        reference_loader = loaders.get(args.references_format)
        if reference_loader is None:
            raise ValueError(
                f'File format {args.references_format} not supported for reference spectra library.'
            )
        reference_spectra = list(reference_loader(args.references_filename))

    metric_classes = {
        'CosineGreedy': CosineGreedy,
        'CosineHungarian': CosineHungarian,
        'ModifiedCosine': ModifiedCosine,
    }
    metric_class = metric_classes.get(args.similarity_metric)
    if metric_class is None:
        return -1
    similarity_metric = metric_class(args.tolerance, args.mz_power,
                                     args.intensity_power)
    if args.similarity_metric == 'ModifiedCosine':
        # The precursor m/z conversion is applied only for this metric.
        reference_spectra = [
            convert_precursor_mz(s) for s in reference_spectra
        ]
        queries_spectra = [convert_precursor_mz(s) for s in queries_spectra]

    print("Calculating scores...")
    references = queries_spectra if args.symmetric else reference_spectra
    scores = calculate_scores(references=references,
                              queries=queries_spectra,
                              similarity_function=similarity_metric,
                              is_symmetric=args.symmetric)

    write_outputs(args, scores)
    return 0
def library_matching(documents_query: List[SpectrumDocument],
                     documents_library: List[SpectrumDocument],
                     model,
                     presearch_based_on=["parentmass", "spec2vec-top10"],
                     ignore_non_annotated: bool = True,
                     include_scores=["spec2vec", "cosine", "modcosine"],
                     intensity_weighting_power: float = 0.5,
                     allowed_missing_percentage: float = 0,
                     cosine_tol: float = 0.005,
                     mass_tolerance: float = 1.0):
    """Selecting potential spectra matches with spectra library.

    Suitable candidates will be selected by 1) top_n Spec2Vec similarity, and
    2) matching parent mass. For later matching routines, additional scores
    (cosine, modified cosine) are added as well.

    Args:
    --------
    documents_query:
        List containing all spectrum documents that should be queried against
        the library.
    documents_library:
        List containing all library spectrum documents.
    model:
        Pretrained word2Vec model.
    presearch_based_on:
        List of strings selecting the presearch measures. 'parentmass' and
        'spec2vec-topX' are recognized, where X is the number of entries with
        the highest Spec2Vec scores to keep. The default list is only read,
        never modified.
    ignore_non_annotated: bool, optional
        If True, only annotated spectra (with a non-empty "smiles" entry) will
        be considered for matching. Default = True.
    include_scores:
        Only 'spec2vec' is evaluated here: it requests a Spec2Vec score for
        the candidates when Spec2Vec was not part of the presearch. Cosine
        and modified cosine scores are always computed.
    intensity_weighting_power: float, optional
        Forwarded to the Spec2Vec similarity. Default = 0.5.
    allowed_missing_percentage: float, optional
        Forwarded to the Spec2Vec similarity. Default = 0.
    cosine_tol: float, optional
        Set tolerance for the cosine and modified cosine score. Default = 0.005
    mass_tolerance
        Specify tolerance for a parentmass match.

    Returns:
    --------
    List with one entry per query document: a pandas DataFrame (indexed by
    the position of the candidate in documents_library) or an empty list if no
    candidate was selected. The "mass_match" column is only present when
    'parentmass' is part of the presearch.
    """
    # Initializations
    found_matches = []
    m_spec2vec_similarities = None
    # Stays None when no parent mass presearch is requested. Indexing a
    # zero-row placeholder with the selected candidate ids would raise an
    # IndexError, so the column is simply left out in that case.
    m_mass_matches = None

    library_spectra_metadata = [
        doc._obj.get("smiles") for doc in documents_library
    ]
    if ignore_non_annotated:
        # Get array of all ids for spectra with smiles. The explicit dtype
        # keeps the array usable as an index even when it is empty.
        library_ids = np.asarray(
            [i for i, x in enumerate(library_spectra_metadata) if x],
            dtype=int)
    else:
        library_ids = np.arange(len(documents_library))

    # 1. Search for top-n Spec2Vec matches ------------------------------------
    if np.any(["spec2vec" in x for x in presearch_based_on]):
        top_n = int([
            x.split("top")[1] for x in presearch_based_on if "spec2vec" in x
        ][0])
        print("Pre-selection includes spec2vec top {}.".format(top_n))
        spec2vec = Spec2VecParallel(
            model=model,
            intensity_weighting_power=intensity_weighting_power,
            allowed_missing_percentage=allowed_missing_percentage)
        m_spec2vec_similarities = spec2vec(
            [documents_library[i] for i in library_ids], documents_query)

        # Select top_n similarity values:
        selection_spec2vec = np.argpartition(m_spec2vec_similarities,
                                             -top_n,
                                             axis=0)[-top_n:, :]
    else:
        selection_spec2vec = np.empty((0, len(documents_query)), dtype="int")

    # 2. Search for parent mass based matches ---------------------------------
    if "parentmass" in presearch_based_on:
        mass_matching = ParentmassMatchParallel(mass_tolerance)
        m_mass_matches = mass_matching(
            [documents_library[i]._obj for i in library_ids],
            [x._obj for x in documents_query])

    # 3. Combine found matches ------------------------------------------------
    # The similarity objects do not depend on the query: create them once.
    cosine_similarity = CosineGreedyNumba(tolerance=cosine_tol)
    mod_cosine_similarity = ModifiedCosine(tolerance=cosine_tol)
    spec2vec_similarity = None
    no_mass_matches = np.empty(0, dtype="int")

    for i in range(len(documents_query)):
        s2v_top_ids = selection_spec2vec[:, i]
        if m_mass_matches is not None:
            mass_match_ids = np.where(m_mass_matches[:, i] == 1)[0]
        else:
            mass_match_ids = no_mass_matches

        all_match_ids = np.unique(
            np.concatenate((s2v_top_ids, mass_match_ids)))

        if len(all_match_ids) == 0:
            found_matches.append([])
            continue

        matched_library_ids = library_ids[all_match_ids]
        query_spectrum = documents_query[i]._obj

        # Get cosine score for found matches
        cosine_scores = [
            cosine_similarity(documents_library[match_id]._obj,
                              query_spectrum)
            for match_id in matched_library_ids
        ]
        # Get modified cosine score for found matches
        mod_cosine_scores = [
            mod_cosine_similarity(documents_library[match_id]._obj,
                                  query_spectrum)
            for match_id in matched_library_ids
        ]

        columns = {}
        if m_mass_matches is not None:
            columns["mass_match"] = m_mass_matches[all_match_ids, i]
        columns.update({
            "cosine_score": [x[0] for x in cosine_scores],
            "cosine_matches": [x[1] for x in cosine_scores],
            "mod_cosine_score": [x[0] for x in mod_cosine_scores],
            "mod_cosine_matches": [x[1] for x in mod_cosine_scores]
        })
        matches_df = pd.DataFrame(columns, index=matched_library_ids)

        if m_spec2vec_similarities is not None:
            matches_df["s2v_score"] = m_spec2vec_similarities[all_match_ids,
                                                              i]
        elif "spec2vec" in include_scores:
            if spec2vec_similarity is None:
                spec2vec_similarity = Spec2Vec(
                    model=model,
                    intensity_weighting_power=intensity_weighting_power,
                    allowed_missing_percentage=allowed_missing_percentage)
            # NOTE(review): the wrapped spectra (._obj) are passed here, not
            # the documents themselves - confirm that Spec2Vec expects this.
            matches_df["s2v_score"] = [
                spec2vec_similarity(documents_library[match_id]._obj,
                                    query_spectrum)
                for match_id in matched_library_ids
            ]
        found_matches.append(matches_df.fillna(0))

    return found_matches
# 'similarities_unique_inchikey_spec2vec_librarymodel.npy') # if not os.path.exists(sims_out): # spec2vec_similarity = Spec2Vec(model, intensity_weighting_power=0.5) # similarity_matrix = spec2vec_similarity.matrix( # uniq_documents_processed, uniq_documents_processed, # is_symmetric=True) # np.save(sims_out, similarity_matrix) # else: # similarity_matrix = np.load(sims_out) # classical mod cosine similarities mod_cos_sims_out = os.path.join( cmd.output_dir, "similarities_unique_inchikey_mod_cosine.npy") if not os.path.exists(mod_cos_sims_out): similarity_measure = ModifiedCosine(tolerance=0.005, mz_power=0, intensity_power=1.0) mod_cos_similarity = similarity_measure.matrix( uniq_spectrums_classical, uniq_spectrums_classical, is_symmetric=True) np.save(mod_cos_sims_out, mod_cos_similarity) else: mod_cos_similarity = np.load(mod_cos_sims_out) # # md s2v similarities # md_sims_out = os.path.join( # cmd.output_dir, # 'similarities_unique_inchikey_mds_spec2vec_librarymodel.npy') # if not os.path.exists(md_sims_out): # md_spec2vec_similarity = Spec2Vec(model_mds,