def create_dummy_scores_symmetric():
    """Return a symmetric Scores object over the dummy spectrums.

    References and queries are the same spectrum list; similarity is the
    Dice score on spectrum fingerprints.
    """
    spectrums = create_dummy_spectrums()
    dice_measure = FingerprintSimilarity("dice")
    return calculate_scores(spectrums, spectrums, dice_measure)
def test_fingerprint_similarity_with_scores_sorting():
    """Test if score works with Scores.scores_by_query and sorting."""
    mz = numpy.array([100.0, 101.0], dtype="float")
    intensities = numpy.array([0.4, 0.5], dtype="float")

    # One spectrum without a fingerprint plus two with fingerprints.
    fingerprints = [
        None,
        numpy.array([0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0]),
        numpy.array([0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1]),
    ]
    spectrums = [
        Spectrum(mz=mz, intensities=intensities,
                 metadata={} if fp is None else {"fingerprint": fp})
        for fp in fingerprints
    ]

    measure = FingerprintSimilarity(set_empty_scores=0,
                                    similarity_measure="cosine")
    scores = calculate_scores(spectrums, spectrums, measure)

    sorted_scores = scores.scores_by_query(spectrums[1], sort=True)
    obtained = numpy.array([entry[1] for entry in sorted_scores])
    expected = numpy.array([1.0, 0.84515425, 0.0])
    assert numpy.allclose(obtained, expected, atol=1e-6), \
        "Expected different scores and/or order."
def create_dummy_scores_symmetric_modified_cosine(): spectrums = create_dummy_spectrums() # Create Scores object by calculating dice scores similarity_measure = ModifiedCosine() scores = calculate_scores(spectrums, spectrums, similarity_measure) return scores
def test_scores_by_reference_sorted():
    "Test scores_by_reference method with sort=True."
    def build(mz_values, intensity_values, identifier):
        # Small local factory to keep the fixture construction compact.
        return Spectrum(mz=numpy.array(mz_values),
                        intensities=numpy.array(intensity_values),
                        metadata={'id': identifier})

    spectrum_1 = build([100, 150, 200.], [0.7, 0.2, 0.1], 'spectrum1')
    spectrum_2 = build([100, 140, 190.], [0.4, 0.2, 0.1], 'spectrum2')
    spectrum_3 = build([110, 140, 195.], [0.6, 0.2, 0.1], 'spectrum3')
    spectrum_4 = build([100, 150, 200.], [0.6, 0.1, 0.6], 'spectrum4')

    scores = calculate_scores([spectrum_1, spectrum_2, spectrum_3],
                              [spectrum_3, spectrum_4, spectrum_2],
                              CosineGreedy())
    selected_scores = scores.scores_by_reference(spectrum_2, sort=True)

    # Descending order corresponds to query indices 2, 1, 0.
    expected_pairs = [(scores.queries[i], scores.scores[1, i])
                      for i in [2, 1, 0]]
    assert selected_scores == expected_pairs, "Expected different scores."

    obtained = numpy.array([pair[1]["score"] for pair in selected_scores])
    expected_values = numpy.array(
        [1.0, 0.6129713330865563, 0.1363196353181994])
    assert numpy.allclose(obtained, expected_values, atol=1e-8), \
        "Expected different sorted scores."
def library_match(spectra_list, lib_mgf, precursor_tol=1.0, cosine=0.7,
                  n_peaks=3):
    """Reads a given library mgf file and matches the given spectra to the
    library spectra using normal cosine. Each test spectra is given the name
    of the library spectra match with the highest cosine score.

    Parameters
    ----------
    spectra_list : list
        Query spectra to match against the library.
    lib_mgf : str
        Path to the library .mgf file.
    precursor_tol, cosine, n_peaks
        NOTE(review): these thresholds are not used in the visible body —
        presumably applied further downstream; confirm against callers.
    """
    library = load_from_mgf(lib_mgf)
    # Apply filters to clean and enhance each spectrum
    library_spectra = []
    for spectrum in library:
        # spectrum = default_filters(spectrum)
        # Scale peak intensities to maximum of 1
        spectrum = normalize_intensities(spectrum)
        library_spectra.append(spectrum)
    scores = calculate_scores(references=library_spectra,
                              queries=spectra_list,
                              similarity_function=CosineHungarian())
    scores_list = []
    for score in scores:
        print(score)
        scores_list.append(score)
    # Sort (reference, query, score, ...) entries by score, highest first.
    # Fix: the lambda parameter previously shadowed the builtin `tuple`.
    scores_list.sort(reverse=True, key=lambda entry: entry[2])
def create_dummy_scores():
    """Return an asymmetric Scores object from the dummy spectrums.

    The first five spectrums act as references, the remainder as queries;
    similarity is the Dice score on fingerprints.
    """
    spectrums = create_dummy_spectrums()
    dice_measure = FingerprintSimilarity("dice")
    return calculate_scores(spectrums[:5], spectrums[5:], dice_measure)
def test_metadata_match_strings(spectrums):
    """Test basic metadata matching between string entries."""
    matcher = MetadataMatch(field="instrument_type")
    scores = calculate_scores(spectrums[:2], spectrums[2:], matcher)
    expected = [[1, 0], [0, 0]]
    assert np.all(scores.scores == expected), "Expected different scores."
def test_metadata_match_numerical(spectrums, tolerance, expected):
    """Test basic metadata matching between numerical entries."""
    matcher = MetadataMatch(field="retention_time",
                            matching_type="difference",
                            tolerance=tolerance)
    scores = calculate_scores(spectrums[:2], spectrums[2:], matcher)
    assert np.all(scores.scores == expected), "Expected different scores."
def test_user_workflow():
    """End-to-end workflow: load pesticides.mgf, clean every spectrum, score
    all-vs-all with CosineGreedy(tolerance=0.3), and pin the top-10 pairs."""
    def apply_my_filters(s):
        # Typical user-defined cleaning pipeline; yields None when the
        # spectrum fails require_minimum_number_of_peaks.
        s = default_filters(s)
        s = add_parent_mass(s)
        s = normalize_intensities(s)
        s = select_by_relative_intensity(s, intensity_from=0.0,
                                         intensity_to=1.0)
        s = select_by_mz(s, mz_from=0, mz_to=1000)
        s = require_minimum_number_of_peaks(s, n_required=5)
        return s

    module_root = os.path.join(os.path.dirname(__file__), "..")
    spectrums_file = os.path.join(module_root, "tests", "pesticides.mgf")

    # apply my filters to the data
    spectrums = [apply_my_filters(s) for s in load_from_mgf(spectrums_file)]

    # omit spectrums that didn't qualify for analysis
    spectrums = [s for s in spectrums if s is not None]

    # this will be a library grouping analysis, so queries = references = spectrums
    queries = spectrums[:]
    references = spectrums[:]

    # define similarity function
    cosine_greedy = CosineGreedy(tolerance=0.3)

    # calculate_scores
    scores = list(calculate_scores(references, queries, cosine_greedy))

    # filter out self-comparisons, require at least 20 matching peaks:
    filtered = [(reference, query, score, n_matching)
                for (reference, query, score, n_matching) in scores
                if reference != query and n_matching >= 20]

    sorted_by_score = sorted(filtered, key=lambda elem: elem[2], reverse=True)

    actual_top10 = sorted_by_score[:10]
    # Expected pairs are symmetric (each pair appears in both orders).
    expected_top10 = [
        (references[48], queries[50],
         pytest.approx(0.9994783627790965, rel=1e-9), 25),
        (references[50], queries[48],
         pytest.approx(0.9994783627790965, rel=1e-9), 25),
        (references[46], queries[48],
         pytest.approx(0.9990141860269471, rel=1e-9), 27),
        (references[48], queries[46],
         pytest.approx(0.9990141860269471, rel=1e-9), 27),
        (references[46], queries[50],
         pytest.approx(0.9988793406908719, rel=1e-9), 22),
        (references[50], queries[46],
         pytest.approx(0.9988793406908719, rel=1e-9), 22),
        (references[57], queries[59],
         pytest.approx(0.9982171275552505, rel=1e-9), 46),
        (references[59], queries[57],
         pytest.approx(0.9982171275552505, rel=1e-9), 46),
        (references[73], queries[74],
         pytest.approx(0.9973823244169199, rel=1e-9), 23),
        (references[74], queries[73],
         pytest.approx(0.9973823244169199, rel=1e-9), 23),
    ]
    assert actual_top10 == expected_top10
def test_metadata_match_strings_wrong_method(spectrums, caplog):
    """Test basic metadata matching between string entries."""
    matcher = MetadataMatch(field="instrument_type",
                            matching_type="difference")
    scores = calculate_scores(spectrums[:2], spectrums[2:], matcher)
    # A string field with 'difference' matching yields all-zero scores ...
    assert np.all(scores.scores == [[0, 0], [0, 0]]), \
        "Expected different scores."
    # ... and an explanatory log message.
    assert "not compatible with 'difference' method" in caplog.text
def test_scores_by_query_non_tuple_score():
    "Test scores_by_query method."
    spectrum_1, spectrum_2, spectrum_3, spectrum_4 = spectra()
    scores = calculate_scores([spectrum_1, spectrum_2, spectrum_3],
                              [spectrum_2, spectrum_3, spectrum_4],
                              IntersectMz())
    selected = scores.scores_by_query(spectrum_4)
    # spectrum_4 is the third query, hence score column index 2.
    expected = [(scores.references[i], scores.scores[i, 2])
                for i in range(3)]
    assert selected == expected, "Expected different scores."
def test_scores_by_referencey():
    "Test scores_by_reference method."
    # NOTE: the 'referencey' typo in the name is kept deliberately so pytest
    # test selection stays stable.
    spectrum_1, spectrum_2, spectrum_3, spectrum_4 = spectra()
    scores = calculate_scores([spectrum_1, spectrum_2, spectrum_3],
                              [spectrum_3, spectrum_4],
                              CosineGreedy())
    selected = scores.scores_by_reference(spectrum_2)
    # spectrum_2 is reference row 1.
    expected = [(scores.queries[i], scores.scores[1, i]) for i in range(2)]
    assert selected == expected, "Expected different scores."
def test_user_workflow_spec2vec():
    """Spec2Vec workflow: filter spectra, train a tiny Word2Vec model on the
    spectrum documents, and check that top similarities exceed 0.99."""
    def apply_my_filters(s):
        # User-defined pre-processing; yields None when the spectrum fails
        # require_minimum_number_of_peaks.
        s = default_filters(s)
        s = add_parent_mass(s)
        s = add_losses(s)
        s = normalize_intensities(s)
        s = select_by_relative_intensity(s, intensity_from=0.01,
                                         intensity_to=1.0)
        s = select_by_mz(s, mz_from=0, mz_to=1000)
        s = require_minimum_number_of_peaks(s, n_required=5)
        return s

    module_root = os.path.join(os.path.dirname(__file__), '..')
    spectrums_file = os.path.join(module_root, 'tests', 'pesticides.mgf')

    # apply my filters to the data
    spectrums = [apply_my_filters(s) for s in load_from_mgf(spectrums_file)]

    # omit spectrums that didn't qualify for analysis
    spectrums = [s for s in spectrums if s is not None]

    documents = [SpectrumDocument(s) for s in spectrums]

    # create and train model
    # NOTE(review): `size=` is the gensim<4 keyword (renamed to vector_size
    # in gensim>=4) -- confirm the pinned gensim version.
    model = gensim.models.Word2Vec([d.words for d in documents], size=5,
                                   min_count=1)
    model.train([d.words for d in documents], total_examples=len(documents),
                epochs=20)

    # define similarity_function
    spec2vec = Spec2Vec(model=model, documents=documents)

    references = documents[:26]
    queries = documents[25:]

    # calculate scores on all combinations of references and queries
    scores = list(calculate_scores(references, queries, spec2vec))

    # filter out self-comparisons
    filtered = [(reference, query, score)
                for (reference, query, score) in scores
                if reference != query]

    sorted_by_score = sorted(filtered, key=lambda elem: elem[2], reverse=True)

    actual_top10 = sorted_by_score[:10]
    actual_scores = [score for (reference, query, score) in actual_top10]
    assert max(actual_scores) > 0.99
def test_scores_by_reference_sorted():
    "Test scores_by_reference method with sort=True."
    spectrum_1, spectrum_2, spectrum_3, spectrum_4 = spectra()
    scores = calculate_scores([spectrum_1, spectrum_2, spectrum_3],
                              [spectrum_3, spectrum_4, spectrum_2],
                              CosineGreedy())
    selected = scores.scores_by_reference(spectrum_2, sort=True)

    # Descending score order corresponds to query indices 2, 1, 0 of
    # reference row 1 (spectrum_2).
    expected_pairs = [(scores.queries[i], scores.scores[1, i])
                      for i in [2, 1, 0]]
    assert selected == expected_pairs, "Expected different scores."

    obtained = numpy.array([pair[1]["score"] for pair in selected])
    expected_values = numpy.array(
        [1.0, 0.6129713330865563, 0.1363196353181994])
    assert numpy.allclose(obtained, expected_values, atol=1e-8), \
        "Expected different sorted scores."
def test_scores_by_query_non_tuple_score():
    "Test scores_by_query method."
    def build(mz_values, intensity_values, identifier):
        # Local factory keeping spectrum construction compact.
        return Spectrum(mz=numpy.array(mz_values),
                        intensities=numpy.array(intensity_values),
                        metadata={'id': identifier})

    spectrum_1 = build([100, 150, 200.], [0.7, 0.2, 0.1], 'spectrum1')
    spectrum_2 = build([100, 140, 190.], [0.4, 0.2, 0.1], 'spectrum2')
    spectrum_3 = build([110, 140, 195.], [0.6, 0.2, 0.1], 'spectrum3')
    spectrum_4 = build([100, 150, 200.], [0.6, 0.1, 0.6], 'spectrum4')

    scores = calculate_scores([spectrum_1, spectrum_2, spectrum_3],
                              [spectrum_2, spectrum_3, spectrum_4],
                              IntersectMz())
    selected = scores.scores_by_query(spectrum_4)

    # spectrum_4 is the third query, hence score column index 2.
    expected = [(scores.references[i], scores.scores[i, 2])
                for i in range(3)]
    assert selected == expected, "Expected different scores."
def test_scores_by_query_sorted():
    "Test scores_by_query method with sort=True."
    def build(mz_values, intensity_values, identifier):
        # Local factory keeping spectrum construction compact.
        return Spectrum(mz=numpy.array(mz_values),
                        intensities=numpy.array(intensity_values),
                        metadata={'id': identifier})

    spectrum_1 = build([100, 150, 200.], [0.7, 0.2, 0.1], 'spectrum1')
    spectrum_2 = build([100, 140, 190.], [0.4, 0.2, 0.1], 'spectrum2')
    spectrum_3 = build([100, 140, 195.], [0.6, 0.2, 0.1], 'spectrum3')
    spectrum_4 = build([100, 150, 200.], [0.6, 0.1, 0.6], 'spectrum4')

    scores = calculate_scores([spectrum_1, spectrum_2, spectrum_3],
                              [spectrum_2, spectrum_3, spectrum_4],
                              CosineGreedy())
    selected = scores.scores_by_query(spectrum_4, sort=True)

    # Descending score order corresponds to reference indices 0, 2, 1.
    expected = [(scores.references[i], scores.scores[i, 2])
                for i in [0, 2, 1]]
    assert selected == expected, "Expected different scores."
def test_scores_by_referencey():
    "Test scores_by_reference method."
    # NOTE: the 'referencey' typo in the name is kept deliberately so pytest
    # test selection stays stable.
    def build(mz_values, intensity_values, identifier):
        return Spectrum(mz=numpy.array(mz_values),
                        intensities=numpy.array(intensity_values),
                        metadata={'id': identifier})

    spectrum_1 = build([100, 150, 200.], [0.7, 0.2, 0.1], 'spectrum1')
    spectrum_2 = build([100, 140, 190.], [0.4, 0.2, 0.1], 'spectrum2')
    spectrum_3 = build([110, 140, 195.], [0.6, 0.2, 0.1], 'spectrum3')
    spectrum_4 = build([100, 150, 200.], [0.6, 0.1, 0.6], 'spectrum4')

    scores = calculate_scores([spectrum_1, spectrum_2, spectrum_3],
                              [spectrum_3, spectrum_4],
                              CosineGreedy())
    selected = scores.scores_by_reference(spectrum_2)

    # spectrum_2 is reference row 1.
    expected = [(scores.queries[i], scores.scores[1, i]) for i in range(2)]
    assert selected == expected, "Expected different scores."
file = load_from_mgf(input_mgf) print(file) print("normalising intensities") # Apply filters to clean and enhance each spectrum spectrums = [] for spectrum in file: spectrum = default_filters(spectrum) # Scale peak intensities to maximum of 1 spectrum = normalize_intensities(spectrum) print(spectrum.get('precursor_mz')) spectrums.append(spectrum) scores = calculate_scores( references=spectrums, queries=spectrums, similarity_function=ModifiedCosine(tolerance=args.fragment_tolerance)) spectra_matches = convert.convert_scores(scores) spectra_list = [] for s in spectrums: new = convert.convert_spectrum(s) spectra_list.append(new) else: from msmolnet import read_mgf as mgf input_mgf = f'{args.input}.mgf' print(f"reading file {input_mgf}") spectra_list = mgf.read_mgf(input_mgf)
def main(argv):
    """CLI entry point: score an MSP reference library against MSP query
    spectra and write two ';'-separated CSV files (scores and match counts).

    Returns 0 on success, -1 when the similarity metric is unknown.
    """
    parser = argparse.ArgumentParser(
        description="Compute MSP similarity scores")
    parser.add_argument("references_filename", type=str,
                        help="Path to reference MSP library.")
    parser.add_argument("queries_filename", type=str,
                        help="Path to query spectra.")
    parser.add_argument("similarity_metric", type=str,
                        help='Metric to use for matching.')
    parser.add_argument("output_filename_scores", type=str,
                        help="Path where to store the output .csv scores.")
    parser.add_argument("output_filename_matches", type=str,
                        help="Path where to store the output .csv matches.")
    parser.add_argument("tolerance", type=float,
                        help="Tolerance to use for peak matching.")
    parser.add_argument(
        "mz_power", type=float,
        help="The power to raise mz to in the cosine function.")
    parser.add_argument(
        "intensity_power", type=float,
        help="The power to raise intensity to in the cosine function.")
    # NOTE(review): `argv` is accepted but parse_args() reads sys.argv —
    # confirm whether parse_args(argv) was intended.
    args = parser.parse_args()

    reference_spectra = load_from_msp(args.references_filename)
    queries_spectra = load_from_msp(args.queries_filename)

    if args.similarity_metric == 'CosineGreedy':
        similarity_metric = CosineGreedy(args.tolerance, args.mz_power,
                                         args.intensity_power)
    elif args.similarity_metric == 'CosineHungarian':
        similarity_metric = CosineHungarian(args.tolerance, args.mz_power,
                                            args.intensity_power)
    elif args.similarity_metric == 'ModifiedCosine':
        similarity_metric = ModifiedCosine(args.tolerance, args.mz_power,
                                           args.intensity_power)
        # ModifiedCosine needs a precursor m/z on every spectrum.
        reference_spectra = map(add_precursor_mz, reference_spectra)
        queries_spectra = map(add_precursor_mz, queries_spectra)
    else:
        return -1

    scores = calculate_scores(
        references=list(reference_spectra),
        queries=list(queries_spectra),
        similarity_function=similarity_metric,
    )

    query_names = [spectra.metadata['name'] for spectra in scores.queries]
    reference_names = [
        spectra.metadata['name'] for spectra in scores.references
    ]

    # Write scores to dataframe
    dataframe_scores = DataFrame(
        data=[entry["score"] for entry in scores.scores],
        index=reference_names,
        columns=query_names)
    dataframe_scores.to_csv(args.output_filename_scores, sep=';')

    # Write number of matches to dataframe
    dataframe_matches = DataFrame(
        data=[entry["matches"] for entry in scores.scores],
        index=reference_names,
        columns=query_names)
    dataframe_matches.to_csv(args.output_filename_matches, sep=';')

    return 0
def main(argv):
    """CLI entry point: compute MSP similarity scores with optional default
    filtering (-f), intensity normalization (-n) and a symmetric all-vs-all
    mode (-s) in which the query file doubles as the reference library.
    Results are written via write_outputs().

    Returns 0 on success, -1 when the similarity metric is unknown.
    """
    parser = argparse.ArgumentParser(
        description="Compute MSP similarity scores")
    parser.add_argument("-f", dest="default_filters", action='store_true',
                        help="Apply default filters")
    parser.add_argument("-n", dest="normalize_intensities",
                        action='store_true',
                        help="Normalize intensities.")
    parser.add_argument("-s", dest="symmetric", action='store_true',
                        help="Computation is symmetric.")
    parser.add_argument("--ref", dest="references_filename", type=str,
                        help="Path to reference MSP library.")
    parser.add_argument("queries_filename", type=str,
                        help="Path to query spectra.")
    parser.add_argument("similarity_metric", type=str,
                        help='Metric to use for matching.')
    parser.add_argument("tolerance", type=float,
                        help="Tolerance to use for peak matching.")
    parser.add_argument(
        "mz_power", type=float,
        help="The power to raise mz to in the cosine function.")
    parser.add_argument(
        "intensity_power", type=float,
        help="The power to raise intensity to in the cosine function.")
    parser.add_argument("output_filename_scores", type=str,
                        help="Path where to store the output .csv scores.")
    parser.add_argument("output_filename_matches", type=str,
                        help="Path where to store the output .csv matches.")
    # NOTE(review): `argv` is accepted but parse_args() reads sys.argv —
    # confirm whether parse_args(argv) was intended.
    args = parser.parse_args()

    queries_spectra = list(load_from_msp(args.queries_filename))
    if args.symmetric:
        # Symmetric mode: no separate reference library is loaded; the maps
        # below are no-ops on the empty list.
        reference_spectra = []
    else:
        reference_spectra = list(load_from_msp(args.references_filename))

    if args.default_filters is True:
        print("Applying default filters...")
        queries_spectra = list(map(default_filters, queries_spectra))
        reference_spectra = list(map(default_filters, reference_spectra))

    if args.normalize_intensities is True:
        print("Normalizing intensities...")
        queries_spectra = list(map(normalize_intensities, queries_spectra))
        reference_spectra = list(map(normalize_intensities,
                                     reference_spectra))

    if args.similarity_metric == 'CosineGreedy':
        similarity_metric = CosineGreedy(args.tolerance, args.mz_power,
                                         args.intensity_power)
    elif args.similarity_metric == 'CosineHungarian':
        similarity_metric = CosineHungarian(args.tolerance, args.mz_power,
                                            args.intensity_power)
    elif args.similarity_metric == 'ModifiedCosine':
        similarity_metric = ModifiedCosine(args.tolerance, args.mz_power,
                                           args.intensity_power)
        # ModifiedCosine needs a precursor m/z on every spectrum.
        reference_spectra = list(map(add_precursor_mz, reference_spectra))
        queries_spectra = list(map(add_precursor_mz, queries_spectra))
    else:
        return -1

    print("Calculating scores...")
    scores = calculate_scores(
        references=queries_spectra if args.symmetric else reference_spectra,
        queries=queries_spectra,
        similarity_function=similarity_metric,
        is_symmetric=args.symmetric)

    write_outputs(args, scores)
    return 0
def test_user_workflow():
    """End-to-end workflow: load pesticides.mgf, clean every spectrum, score
    all-vs-all with default CosineGreedy(), and pin the top-10 pairs."""
    def apply_my_filters(s):
        # Typical user-defined cleaning pipeline; yields None when the
        # spectrum fails require_minimum_number_of_peaks.
        s = default_filters(s)
        s = add_parent_mass(s)
        s = normalize_intensities(s)
        s = select_by_relative_intensity(s, intensity_from=0.0,
                                         intensity_to=1.0)
        s = select_by_mz(s, mz_from=0, mz_to=1000)
        s = require_minimum_number_of_peaks(s, n_required=5)
        return s

    module_root = os.path.join(os.path.dirname(__file__), '..')
    spectrums_file = os.path.join(module_root, 'tests', 'pesticides.mgf')

    # apply my filters to the data
    spectrums = [apply_my_filters(s) for s in load_from_mgf(spectrums_file)]

    # omit spectrums that didn't qualify for analysis
    spectrums = [s for s in spectrums if s is not None]

    # this will be a library grouping analysis, so queries = references = spectrums
    queries = spectrums[:]
    references = spectrums[:]

    # define similarity function
    cosine_greedy = CosineGreedy()

    # calculate_scores
    scores = list(calculate_scores(references, queries, cosine_greedy))

    # filter out self-comparisons, require at least 20 matching peaks:
    filtered = [(reference, query, score, n_matching)
                for (reference, query, score, n_matching) in scores
                if reference != query and n_matching >= 20]

    sorted_by_score = sorted(filtered, key=lambda elem: elem[2], reverse=True)

    actual_top10 = sorted_by_score[:10]
    # Expected pairs are symmetric (each pair appears in both orders).
    expected_top10 = [
        (references[48], queries[50],
         pytest.approx(0.9994510368270997, rel=1e-9), 25),
        (references[50], queries[48],
         pytest.approx(0.9994510368270997, rel=1e-9), 25),
        (references[46], queries[48],
         pytest.approx(0.9981252309590571, rel=1e-9), 27),
        (references[48], queries[46],
         pytest.approx(0.9981252309590571, rel=1e-9), 27),
        (references[46], queries[50],
         pytest.approx(0.9979632203390496, rel=1e-9), 22),
        (references[50], queries[46],
         pytest.approx(0.9979632203390496, rel=1e-9), 22),
        (references[73], queries[74],
         pytest.approx(0.9956795920716246, rel=1e-9), 23),
        (references[74], queries[73],
         pytest.approx(0.9956795920716246, rel=1e-9), 23),
        (references[57], queries[59],
         pytest.approx(0.9886557001269415, rel=1e-9), 46),
        (references[59], queries[57],
         pytest.approx(0.9886557001269415, rel=1e-9), 46),
    ]
    assert actual_top10 == expected_top10
def main():
    """Create Spec2Vec similarity pairs from an MGF file.

    Loads spectra, post-processes them, embeds them with a pre-trained
    Word2Vec model, and writes every pair with similarity >= --min_score to
    a tab-separated edge list.
    """
    parser = argparse.ArgumentParser(description='Creating Spec2Vec Pairs')
    parser.add_argument('input_mgf', help='input_mgf')
    parser.add_argument('output_pairs', help='output_pairs')
    parser.add_argument('model_file', help='model_file')
    # NOTE(review): help text 'model_file' looks like a copy-paste slip for
    # --min_score; the help string is user-facing but left unchanged here.
    parser.add_argument('--min_score', type=float, default=0.7,
                        help='model_file')
    args = parser.parse_args()

    spectra = load_from_mgf(args.input_mgf)
    filtered_spectra = [post_process(s) for s in spectra]

    # Omit spectrums that didn't qualify for analysis
    filtered_spectra = [s for s in filtered_spectra if s is not None]

    # Create spectrum documents
    query_documents = [
        SpectrumDocument(s, n_decimals=2) for s in filtered_spectra
    ]

    #DEBUG
    #query_documents = query_documents[:100]

    # Loading the model
    model = gensim.models.Word2Vec.load(args.model_file)

    # Define similarity_function
    spec2vec = Spec2Vec(model=model,
                        intensity_weighting_power=0.5,
                        allowed_missing_percentage=80.0)

    print("total documents", len(query_documents))

    scores = calculate_scores(query_documents, query_documents,
                              spec2vec).scores
    number_of_spectra = len(query_documents)

    output_scores_list = []
    # Visit the strict lower triangle only (i > j): each unordered pair is
    # emitted once and self-pairs are skipped.
    for i in range(number_of_spectra):
        for j in range(number_of_spectra):
            if i <= j:
                continue
            i_spectrum = filtered_spectra[i]
            j_spectrum = filtered_spectra[j]
            sim = scores[i][j]
            if sim < args.min_score:
                continue
            score_dict = {}
            score_dict["filename"] = args.input_mgf
            score_dict["CLUSTERID1"] = i_spectrum.metadata["scans"]
            score_dict["CLUSTERID2"] = j_spectrum.metadata["scans"]
            score_dict["Cosine"] = sim
            score_dict["mz1"] = i_spectrum.metadata["pepmass"][0]
            score_dict["mz2"] = j_spectrum.metadata["pepmass"][0]
            score_dict["DeltaMZ"] = score_dict["mz2"] - score_dict["mz1"]
            score_dict["EdgeAnnotation"] = "Spec2Vec"
            output_scores_list.append(score_dict)

    # Saving Data Out
    pd.DataFrame(output_scores_list).to_csv(args.output_pairs, sep="\t",
                                            index=False)
def main(argv):
    """CLI entry point: compute similarity scores between spectra loaded
    from MSP or MGF files, with a symmetric all-vs-all mode (-s) in which
    the query file doubles as the reference library. Results are written
    via write_outputs().

    Returns 0 on success, -1 when the similarity metric is unknown;
    raises ValueError for an unsupported file format.
    """
    parser = argparse.ArgumentParser(
        description="Compute MSP similarity scores")
    parser.add_argument("-s", dest="symmetric", action='store_true',
                        help="Computation is symmetric.")
    parser.add_argument("--ref", dest="references_filename", type=str,
                        help="Path to reference spectra library.")
    parser.add_argument("--ref_format", dest="references_format", type=str,
                        help="Reference spectra library file format.")
    parser.add_argument("queries_filename", type=str,
                        help="Path to query spectra.")
    parser.add_argument("queries_format", type=str,
                        help="Query spectra file format.")
    parser.add_argument("similarity_metric", type=str,
                        help='Metric to use for matching.')
    parser.add_argument("tolerance", type=float,
                        help="Tolerance to use for peak matching.")
    parser.add_argument(
        "mz_power", type=float,
        help="The power to raise mz to in the cosine function.")
    parser.add_argument(
        "intensity_power", type=float,
        help="The power to raise intensity to in the cosine function.")
    parser.add_argument("output_filename_scores", type=str,
                        help="Path where to store the output .tsv scores.")
    parser.add_argument("output_filename_matches", type=str,
                        help="Path where to store the output .tsv matches.")
    # NOTE(review): `argv` is accepted but parse_args() reads sys.argv —
    # confirm whether parse_args(argv) was intended.
    args = parser.parse_args()

    if args.queries_format == 'msp':
        queries_spectra = list(load_from_msp(args.queries_filename))
    elif args.queries_format == 'mgf':
        queries_spectra = list(load_from_mgf(args.queries_filename))
    else:
        raise ValueError(
            f'File format {args.queries_format} not supported for query spectra.'
        )

    if args.symmetric:
        # Symmetric mode: no separate reference library is loaded.
        reference_spectra = []
    else:
        if args.references_format == 'msp':
            reference_spectra = list(load_from_msp(args.references_filename))
        elif args.references_format == 'mgf':
            reference_spectra = list(load_from_mgf(args.references_filename))
        else:
            raise ValueError(
                f'File format {args.references_format} not supported for reference spectra library.'
            )

    if args.similarity_metric == 'CosineGreedy':
        similarity_metric = CosineGreedy(args.tolerance, args.mz_power,
                                         args.intensity_power)
    elif args.similarity_metric == 'CosineHungarian':
        similarity_metric = CosineHungarian(args.tolerance, args.mz_power,
                                            args.intensity_power)
    elif args.similarity_metric == 'ModifiedCosine':
        similarity_metric = ModifiedCosine(args.tolerance, args.mz_power,
                                           args.intensity_power)
        # ModifiedCosine needs a precursor m/z on every spectrum.
        reference_spectra = list(map(convert_precursor_mz, reference_spectra))
        queries_spectra = list(map(convert_precursor_mz, queries_spectra))
    else:
        return -1

    print("Calculating scores...")
    scores = calculate_scores(
        references=queries_spectra if args.symmetric else reference_spectra,
        queries=queries_spectra,
        similarity_function=similarity_metric,
        is_symmetric=args.symmetric)

    write_outputs(args, scores)
    return 0
def test_user_workflow_spec2vec():
    """Test typical user workflow to get from mass spectra to spec2vec
    similarities.

    This test will run a typical workflow example using a small dataset and
    a pretrained word2vec model. One main aspect of this is to test if users
    will get exactly the same spec2vec similarity scores when starting from
    a word2vec model that was trained and saved elsewhere.
    """
    def apply_my_filters(s):
        """This is how a user would typically design his own pre- and post-
        processing pipeline."""
        s = default_filters(s)
        s = add_parent_mass(s)
        s = normalize_intensities(s)
        s = reduce_to_number_of_peaks(s, n_required=10, ratio_desired=0.5)
        s = select_by_mz(s, mz_from=0, mz_to=1000)
        s = add_losses(s, loss_mz_from=10.0, loss_mz_to=200.0)
        s = require_minimum_number_of_peaks(s, n_required=5)
        return s

    repository_root = os.path.join(os.path.dirname(__file__), "..")
    spectrums_file = os.path.join(repository_root, "tests", "pesticides.mgf")

    # apply my filters to the data
    spectrums = [apply_my_filters(s) for s in load_from_mgf(spectrums_file)]

    # omit spectrums that didn't qualify for analysis
    spectrums = [s for s in spectrums if s is not None]

    documents = [SpectrumDocument(s) for s in spectrums]

    model_file = os.path.join(repository_root, "integration-tests",
                              "test_user_workflow_spec2vec.model")
    if os.path.isfile(model_file):
        # Reuse the saved model so scores are reproducible across runs.
        model = gensim.models.Word2Vec.load(model_file)
    else:
        # create and train model
        # NOTE(review): `size=` is the gensim<4 keyword (vector_size in
        # gensim>=4) -- confirm the pinned gensim version.
        model = gensim.models.Word2Vec([d.words for d in documents], size=5,
                                       min_count=1)
        model.train([d.words for d in documents],
                    total_examples=len(documents), epochs=20)
        model.save(model_file)

    # define similarity_function
    spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5)

    references = documents[:26]
    queries = documents[25:]

    # calculate scores on all combinations of references and queries
    scores = list(calculate_scores(references, queries, spec2vec))

    # filter out self-comparisons
    filtered = [(reference, query, score)
                for (reference, query, score) in scores
                if reference != query]

    sorted_by_score = sorted(filtered, key=lambda elem: elem[2], reverse=True)

    actual_top10 = sorted_by_score[:10]
    expected_top10 = [(documents[19], documents[25],
                       pytest.approx(0.9999121928249473, rel=1e-9)),
                      (documents[20], documents[25],
                       pytest.approx(0.9998846890269892, rel=1e-9)),
                      (documents[20], documents[45],
                       pytest.approx(0.9998756073673759, rel=1e-9)),
                      (documents[25], documents[45],
                       pytest.approx(0.9998750427994474, rel=1e-9)),
                      (documents[19], documents[27],
                       pytest.approx(0.9998722768460854, rel=1e-9)),
                      (documents[22], documents[27],
                       pytest.approx(0.9998633023352553, rel=1e-9)),
                      (documents[18], documents[27],
                       pytest.approx(0.9998616961532616, rel=1e-9)),
                      (documents[19], documents[45],
                       pytest.approx(0.9998528723697396, rel=1e-9)),
                      (documents[14], documents[71],
                       pytest.approx(0.9998404364805897, rel=1e-9)),
                      (documents[20], documents[27],
                       pytest.approx(0.9998336807761137, rel=1e-9))]
    assert actual_top10 == expected_top10, "Expected different top 10 table."