Пример #1
0
def test_modified_cosine_order_of_input_spectrums():
    """Check that swapping the two input spectra does not change the result."""
    shared_intensities = [700, 200, 100, 1000, 200, 5, 500]

    first = Spectrum(
        mz=numpy.array([100, 150, 200, 300, 500, 510, 1100], dtype="float"),
        intensities=numpy.array(shared_intensities, dtype="float"),
        metadata={"precursor_mz": 1000.0})
    second = Spectrum(
        mz=numpy.array([55, 105, 205, 304.5, 494.5, 515.5, 1045],
                       dtype="float"),
        intensities=numpy.array(shared_intensities, dtype="float"),
        metadata={"precursor_mz": 1005.0})

    first_normalized = normalize_intensities(first)
    second_normalized = normalize_intensities(second)
    similarity = ModifiedCosine(tolerance=2.0)

    # Score the same pair once in each argument order.
    forward_score, forward_matches = similarity.pair(first_normalized,
                                                     second_normalized)
    backward_score, backward_matches = similarity.pair(second_normalized,
                                                       first_normalized)

    assert forward_score == backward_score, "Expected that the order of the arguments would not matter."
    assert forward_matches == backward_matches, "Expected that the order of the arguments would not matter."
Пример #2
0
def test_modified_cosine_with_mass_shift(peaks, tolerance, masses,
                                         expected_matches):
    """Test modified cosine on two spectra with mass shift.

    The arguments are supplied by the caller (presumably a pytest
    parametrization that is not part of this block):

    peaks:
        Two entries, one per spectrum; ``peaks[k][0]`` holds the m/z values
        and ``peaks[k][1]`` the intensities of spectrum ``k``.
    tolerance:
        Tolerance passed to ``ModifiedCosine``; ``None`` selects the default.
    masses:
        Two entries used as ``precursor_mz`` of the first and second spectrum.
    expected_matches:
        The expected matching peaks; forwarded to ``compute_expected_score``
        and its length is compared with the reported number of matches.
    """
    builder = SpectrumBuilder()
    spectrum_1 = builder.with_mz(peaks[0][0]).with_intensities(
        peaks[0][1]).with_metadata(metadata={
            "precursor_mz": masses[0]
        }).build()
    # NOTE(review): the same builder instance is reused for the second
    # spectrum; this relies on every ``with_*`` call overriding the earlier
    # value - confirm against SpectrumBuilder.
    spectrum_2 = builder.with_mz(peaks[1][0]).with_intensities(
        peaks[1][1]).with_metadata(metadata={
            "precursor_mz": masses[1]
        }).build()

    norm_spectrum_1 = normalize_intensities(spectrum_1)
    norm_spectrum_2 = normalize_intensities(spectrum_2)
    if tolerance is None:
        modified_cosine = ModifiedCosine()
    else:
        modified_cosine = ModifiedCosine(tolerance=tolerance)

    score = modified_cosine.pair(norm_spectrum_1, norm_spectrum_2)
    expected_score = compute_expected_score(norm_spectrum_1, norm_spectrum_2,
                                            expected_matches)
    # The tolerance is relative; spelled out as ``rel=`` to make that explicit.
    assert score["score"] == pytest.approx(
        expected_score, rel=0.0001), "Expected different cosine score."
    assert score["matches"] == len(
        expected_matches), "Expected different number of matching peaks."
Пример #3
0
def test_modified_cosine_with_mass_shifted_and_unshifted_matches():
    """Check modified cosine when shifted and unshifted matches compete.

    Five peak pairs are possible here, but only three may be selected because
    every peak can be counted just once.
    """
    first = Spectrum(
        mz=numpy.array([100, 110, 200, 300, 400, 500, 600], dtype="float"),
        intensities=numpy.array([100, 50, 1, 80, 1, 1, 50], dtype="float"),
        metadata={"precursor_mz": 1000.0})
    second = Spectrum(
        mz=numpy.array([110, 200, 300, 310, 700, 800], dtype="float"),
        intensities=numpy.array([100, 1, 90, 90, 1, 100], dtype="float"),
        metadata={"precursor_mz": 1010.0})

    similarity = ModifiedCosine()
    result = similarity.pair(first, second)

    # Rebuild the expected score by hand from the three selected peak pairs.
    first_int = first.peaks.intensities
    second_int = second.peaks.intensities
    matched_products = (first_int[0] * second_int[0]
                        + first_int[3] * second_int[3]
                        + first_int[2] * second_int[1])
    normalization = numpy.sqrt(
        numpy.sum(first_int**2) * numpy.sum(second_int**2))
    expected_score = matched_products / normalization

    assert result["score"] == pytest.approx(
        expected_score, 0.00001), "Expected different cosine score."
    assert result["matches"] == 3, "Expected 3 matching peaks."
Пример #4
0
def test_modified_cosine_without_precursor_mz():
    """Check that spectra lacking precursor-m/z trigger an AssertionError."""
    shared_intensities = [700, 200, 100, 1000, 200, 5, 500]

    # Neither spectrum receives any metadata, so no precursor_mz is set.
    first = Spectrum(
        mz=numpy.array([100, 150, 200, 300, 500, 510, 1100], dtype="float"),
        intensities=numpy.array(shared_intensities, dtype="float"))
    second = Spectrum(
        mz=numpy.array([100, 140, 190, 300, 490, 510, 1090], dtype="float"),
        intensities=numpy.array(shared_intensities, dtype="float"))

    first_normalized = normalize_intensities(first)
    second_normalized = normalize_intensities(second)
    similarity = ModifiedCosine()

    with pytest.raises(AssertionError) as raised:
        similarity.pair(first_normalized, second_normalized)

    assert str(raised.value) == "Precursor_mz missing. Apply 'add_precursor_mz' filter first."
Пример #5
0
def test_modified_cosine_with_mass_shift_5_no_matches_expected():
    """Check that two spectra without any matching peaks score zero."""
    shared_intensities = [10, 10, 500]

    first = Spectrum(
        mz=numpy.array([100, 200, 300], dtype="float"),
        intensities=numpy.array(shared_intensities, dtype="float"),
        metadata={"precursor_mz": 1000.0})
    second = Spectrum(
        mz=numpy.array([120, 220, 320], dtype="float"),
        intensities=numpy.array(shared_intensities, dtype="float"),
        metadata={"precursor_mz": 1005})

    similarity = ModifiedCosine(tolerance=1.0)
    result = similarity.pair(normalize_intensities(first),
                             normalize_intensities(second))

    assert result["score"] == pytest.approx(0.0, 1e-5), "Expected different modified cosine score."
    assert result["matches"] == 0, "Expected 0 matching peaks."
Пример #6
0
def test_modified_cosine_with_mass_shift_5_tolerance_2():
    """Check modified cosine for a precursor shift of 5 with tolerance 2.0."""
    first = Spectrum(
        mz=numpy.array([100, 200, 299, 300, 301, 500, 510], dtype="float"),
        intensities=numpy.array([10, 10, 500, 100, 200, 20, 100],
                                dtype="float"),
        metadata={"precursor_mz": 1000.0})
    second = Spectrum(
        mz=numpy.array([105, 205, 305, 306, 505, 517], dtype="float"),
        intensities=numpy.array([10, 10, 500, 100, 20, 100], dtype="float"),
        metadata={"precursor_mz": 1005})

    similarity = ModifiedCosine(tolerance=2.0)
    result = similarity.pair(normalize_intensities(first),
                             normalize_intensities(second))

    assert result["score"] == pytest.approx(0.96788, 0.0001), "Expected different modified cosine score."
    assert result["matches"] == 6, "Expected 6 matching peaks."
Пример #7
0
def test_modified_cosine_with_mass_shift_5():
    """Check modified cosine for a precursor shift of 5, default tolerance."""
    shared_intensities = [700, 200, 100, 1000, 200, 5, 500]

    first = Spectrum(
        mz=numpy.array([100, 150, 200, 300, 500, 510, 1100], dtype="float"),
        intensities=numpy.array(shared_intensities, dtype="float"),
        metadata={"precursor_mz": 1000.0})
    second = Spectrum(
        mz=numpy.array([55, 105, 205, 304.5, 494.5, 515.5, 1045],
                       dtype="float"),
        intensities=numpy.array(shared_intensities, dtype="float"),
        metadata={"precursor_mz": 1005.0})

    similarity = ModifiedCosine()
    result = similarity.pair(normalize_intensities(first),
                             normalize_intensities(second))

    assert result["score"] == pytest.approx(0.081966, 0.0001), "Expected different cosine score."
    assert result["matches"] == 2, "Expected 2 matching peaks."
Пример #8
0
def test_modified_cosine_precursor_mz_as_invalid_string():
    """Check that a non-numeric precursor_mz string raises an AssertionError."""
    shared_intensities = [10, 10, 500]

    first = Spectrum(
        mz=np.array([100, 200, 300], dtype="float"),
        intensities=np.array(shared_intensities, dtype="float"),
        metadata={"precursor_mz": 1000.0})
    # The second precursor_mz is text that is not a plain number.
    second = Spectrum(
        mz=np.array([120, 220, 320], dtype="float"),
        intensities=np.array(shared_intensities, dtype="float"),
        metadata={"precursor_mz": "mz 1005.0"})

    first_normalized = normalize_intensities(first)
    second_normalized = normalize_intensities(second)
    similarity = ModifiedCosine(tolerance=1.0)

    with pytest.raises(AssertionError) as raised:
        _ = similarity.pair(first_normalized, second_normalized)

    assert str(raised.value) == "Precursor_mz missing. Apply 'add_precursor_mz' filter first."
Пример #9
0
def test_modified_cosine_precursor_mz_as_string(caplog):
    """Check scoring and logging when precursor_mz is a numeric string.

    Metadata harmonization is switched off for both spectra, so the string
    value is handed to the similarity measure unchanged.
    """
    shared_intensities = [10, 10, 500]

    first = Spectrum(
        mz=np.array([100, 200, 300], dtype="float"),
        intensities=np.array(shared_intensities, dtype="float"),
        metadata={"precursor_mz": 1000.0},
        metadata_harmonization=False)
    second = Spectrum(
        mz=np.array([120, 220, 320], dtype="float"),
        intensities=np.array(shared_intensities, dtype="float"),
        metadata={"precursor_mz": "1005.0"},
        metadata_harmonization=False)

    similarity = ModifiedCosine(tolerance=1.0)
    result = similarity.pair(normalize_intensities(first),
                             normalize_intensities(second))

    assert result["score"] == pytest.approx(
        0.0, 1e-5), "Expected different modified cosine score."
    assert result["matches"] == 0, "Expected 0 matching peaks."

    # The type problem is expected to be reported through the log.
    logged_hint = "Precursor_mz must be of type int or float. Apply 'add_precursor_mz' filter first."
    assert logged_hint in caplog.text, "Expected different log message"
def library_matching(
        documents_query: List[SpectrumDocument],
        documents_library: List[SpectrumDocument],
        model: BaseTopicModel,
        presearch_based_on: List[str] = ["precursor_mz", "spec2vec-top10"],
        ignore_non_annotated: bool = True,
        include_scores=["spec2vec", "cosine", "modcosine"],
        intensity_weighting_power: float = 0.5,
        allowed_missing_percentage: float = 0,
        cosine_tol: float = 0.005,
        min_matches: int = 6,
        mass_tolerance: float = 2.0,
        mass_tolerance_type: str = "ppm"):
    """Selecting potential spectra matches with spectra library.

    Suitable candidates will be selected by 1) top_n Spec2Vec similarity,
    2) same precursor mass (within the given tolerance) and/or 3) top_n
    modified cosine similarity, depending on ``presearch_based_on``.
    For later matching routines, additional scores (cosine, modified cosine,
    Spec2Vec) are added for the selected candidates as well.

    Args:
    --------
    documents_query:
        List containing all spectrum documents that should be queried against
        the library.
    documents_library:
        List containing all library spectrum documents.
    model:
        Pretrained word2Vec model.
    presearch_based_on:
        List with strings to specify which measures to use for the presearch.
        This can include 'precursor_mz', 'spec2vec-topX' and 'modcos-topX',
        where X is the number of top candidates to keep per query.
    ignore_non_annotated: bool, optional
        If True, only library spectra with a non-empty "smiles" entry will be
        considered for matching. Default = True.
    include_scores:
        Scores to report for the selected candidates. Can include 'spec2vec',
        'cosine' and 'modcosine'. Cosine / modified cosine columns that are
        not requested are filled with the string "not calculated".
    intensity_weighting_power:
        Passed on to Spec2Vec. Default = 0.5.
    allowed_missing_percentage:
        Passed on to Spec2Vec. Default = 0.
    cosine_tol: float, optional
        Set tolerance for the cosine and modified cosine score. Default = 0.005
    min_matches:
        Only used for the 'modcos-topX' presearch: pairs with fewer matching
        peaks get a similarity of 0 before the top candidates are selected.
    mass_tolerance
        Specify tolerance for a mass match.
    mass_tolerance_type
        Chose between "ppm" (relative) and "Dalton" (absolute) tolerance type.

    Returns:
    --------
    List with one entry per query document. An entry is a pandas DataFrame
    indexed by the position of the candidate in ``documents_library`` (columns
    "cosine_score", "cosine_matches", "mod_cosine_score",
    "mod_cosine_matches" plus, when available, "mass_match" and "s2v_score"),
    or an empty list if no candidate was found for that query.

    Raises:
    --------
    AssertionError
        If ``presearch_based_on`` contains none of the supported measures.
    """
    # NOTE: the list defaults in the signature are shared between calls; they
    # are only read in this function and must never be mutated.
    not_calculated = "not calculated"

    # Initializations
    n_queries = len(documents_query)
    found_matches = []
    m_mass_matches = None
    m_spec2vec_similarities = None
    m_modcos_similarities = None
    m_modcos_matches = None

    if ignore_non_annotated:
        # Positions of all library spectra with smiles. The explicit integer
        # dtype keeps the array usable as an index even if it is empty.
        library_ids = np.asarray(
            [i for i, doc in enumerate(documents_library)
             if doc._obj.get("smiles")],
            dtype=int)
    else:
        library_ids = np.arange(len(documents_library))

    allowed_presearch_type = ["precursor_mz", "spec2vec-top", "modcos-top"]
    msg = "Presearch must include one of: " + ", ".join(allowed_presearch_type)
    assert any(allowed in requested
               for allowed in allowed_presearch_type
               for requested in presearch_based_on), msg

    library_documents = [documents_library[i] for i in library_ids]
    library_spectra = [doc._obj for doc in library_documents]
    query_spectra = [doc._obj for doc in documents_query]

    # 1. Search for top-n Spec2Vec matches ------------------------------------
    spec2vec_top_n = _presearch_top_n(presearch_based_on, "spec2vec")
    if spec2vec_top_n is not None:
        print(f"Pre-selection includes spec2vec top {spec2vec_top_n}.")
        spec2vec = Spec2Vec(
            model=model,
            intensity_weighting_power=intensity_weighting_power,
            allowed_missing_percentage=allowed_missing_percentage,
            progress_bar=True)
        m_spec2vec_similarities = spec2vec.matrix(library_documents,
                                                  documents_query)
        selection_spec2vec = _top_n_row_indices(m_spec2vec_similarities,
                                                spec2vec_top_n)
    else:
        selection_spec2vec = np.empty((0, n_queries), dtype="int")

    # 2. Search for precursor_mz based matches ---------------------------------
    if "precursor_mz" in presearch_based_on:
        print(
            f"Pre-selection includes mass matches within {mass_tolerance} {mass_tolerance_type}."
        )
        mass_matching = PrecursorMzMatch(tolerance=mass_tolerance,
                                         tolerance_type=mass_tolerance_type)
        m_mass_matches = mass_matching.matrix(library_spectra, query_spectra)
        selection_massmatch = [
            np.where(m_mass_matches[:, i] == 1)[0] for i in range(n_queries)
        ]
    else:
        selection_massmatch = np.empty((n_queries, 0), dtype="int")

    # 3. Search for top-n modified cosine matches ------------------------------
    modcos_top_n = _presearch_top_n(presearch_based_on, "modcos")
    if modcos_top_n is not None:
        print(f"Pre-selection includes modified cosine top {modcos_top_n}.")
        modcos = ModifiedCosine(tolerance=cosine_tol)

        n_rows = len(library_ids)
        m_modcos_similarities = np.zeros([n_rows, n_queries],
                                         dtype=np.float64)
        m_modcos_matches = np.zeros([n_rows, n_queries], dtype=np.float64)
        for i_ref, reference in enumerate(tqdm(library_spectra)):
            for i_query, query in enumerate(query_spectra):
                score, matches = _score_and_matches(
                    modcos.pair(reference, query))
                m_modcos_similarities[i_ref, i_query] = score
                m_modcos_matches[i_ref, i_query] = matches

        # Pairs with too few matching peaks must not win the top-n selection.
        m_modcos_selected = m_modcos_similarities.copy()
        m_modcos_selected[m_modcos_matches < min_matches] = 0
        selection_modcos = _top_n_row_indices(m_modcos_selected, modcos_top_n)
    else:
        selection_modcos = np.empty((0, n_queries), dtype="int")

    # 4. Combine found matches ------------------------------------------------
    # The similarity measures do not depend on the query, so they are created
    # once here instead of once per query.
    cosine_similarity = None
    if "cosine" in include_scores:
        print("Calculate cosine score for selected candidates.")
        cosine_similarity = CosineGreedy(tolerance=cosine_tol)

    mod_cosine_similarity = None
    if "modcosine" in include_scores:
        print("Calculate modified cosine score for selected candidates.")
        if m_modcos_similarities is None:
            mod_cosine_similarity = ModifiedCosine(tolerance=cosine_tol)

    spec2vec_similarity = None
    if m_spec2vec_similarities is None and "spec2vec" in include_scores:
        spec2vec_similarity = Spec2Vec(
            model=model,
            intensity_weighting_power=intensity_weighting_power,
            allowed_missing_percentage=allowed_missing_percentage)

    for i in tqdm(range(n_queries)):
        all_match_ids = np.unique(
            np.concatenate((selection_spec2vec[:, i],
                            selection_massmatch[i],
                            selection_modcos[:, i])))

        if len(all_match_ids) == 0:
            found_matches.append([])
            continue

        # all_match_ids index the (possibly filtered) similarity matrices,
        # matched_library_ids index documents_library itself.
        matched_library_ids = library_ids[all_match_ids]
        n_candidates = len(all_match_ids)

        if cosine_similarity is not None:
            cosine_results = [
                _score_and_matches(
                    cosine_similarity.pair(documents_library[match_id]._obj,
                                           query_spectra[i]))
                for match_id in matched_library_ids
            ]
            cosine_score_column = [score for score, _ in cosine_results]
            cosine_matches_column = [matches for _, matches in cosine_results]
        else:
            cosine_score_column = n_candidates * [not_calculated]
            cosine_matches_column = n_candidates * [not_calculated]

        if m_modcos_similarities is not None:
            # Reuse the values that were already computed for the presearch.
            mod_cosine_score_column = list(
                m_modcos_similarities[all_match_ids, i])
            mod_cosine_matches_column = list(
                m_modcos_matches[all_match_ids, i])
        elif mod_cosine_similarity is not None:
            mod_cosine_results = [
                _score_and_matches(
                    mod_cosine_similarity.pair(
                        documents_library[match_id]._obj, query_spectra[i]))
                for match_id in matched_library_ids
            ]
            mod_cosine_score_column = [
                score for score, _ in mod_cosine_results
            ]
            mod_cosine_matches_column = [
                matches for _, matches in mod_cosine_results
            ]
        else:
            mod_cosine_score_column = n_candidates * [not_calculated]
            mod_cosine_matches_column = n_candidates * [not_calculated]

        matches_df = pd.DataFrame(
            {
                "cosine_score": cosine_score_column,
                "cosine_matches": cosine_matches_column,
                "mod_cosine_score": mod_cosine_score_column,
                "mod_cosine_matches": mod_cosine_matches_column,
            },
            index=matched_library_ids)

        if m_mass_matches is not None:
            matches_df["mass_match"] = m_mass_matches[all_match_ids, i]

        if m_spec2vec_similarities is not None:
            matches_df["s2v_score"] = m_spec2vec_similarities[all_match_ids,
                                                              i]
        elif spec2vec_similarity is not None:
            matches_df["s2v_score"] = [
                spec2vec_similarity.pair(documents_library[match_id],
                                         documents_query[i])
                for match_id in matched_library_ids
            ]
        found_matches.append(matches_df.fillna(0))

    return found_matches


def _presearch_top_n(presearch_based_on, keyword):
    """Return N of the first '<keyword>...topN' presearch entry, else None."""
    for entry in presearch_based_on:
        if keyword in entry:
            return int(entry.split("top")[1])
    return None


def _top_n_row_indices(similarities, top_n):
    """Return, per column, the row indices of the top_n largest values.

    top_n is clamped to the number of rows, so asking for more candidates
    than there are library spectra returns all of them instead of failing.
    """
    n_rows, n_cols = np.shape(similarities)
    top_n = min(top_n, n_rows)
    if top_n <= 0:
        return np.empty((0, n_cols), dtype="int")
    return np.argpartition(similarities, -top_n, axis=0)[-top_n:, :]


def _score_and_matches(result):
    """Split a pairwise similarity result into (score, matches).

    Accepts results exposing "score" / "matches" fields as well as plain
    (score, matches) sequences.
    """
    try:
        return result["score"], result["matches"]
    except (TypeError, IndexError, KeyError, ValueError):
        return result[0], result[1]