示例#1
0
def total_mass_error(observed: Spectrum, alignment: str, tolerance: int) -> float:
    '''Accumulate the absolute mass error over every pair of matched masses 
    between the observed spectrum and the spectrum generated from the alignment.

    Hybrid markers ('-', '(' and ')') are stripped from the alignment before 
    its spectrum is generated. Both mass lists are sorted and walked in step; 
    a pair counts as a match when the generated mass lies within the tolerance 
    window around the observed mass. The window is computed by *ppm_to_da* 
    from the generated mass.

    :param observed: observed spectrum
    :type observed: Spectrum
    :param alignment: the string alignment
    :type alignment: str
    :param tolerance: parts per million tolerance allowed when matching masses
    :type tolerance: int

    :returns: sum of the absolute values of all mass errors
    :rtype: float
    '''

    # drop the hybrid markers so only the residues remain
    residues = alignment
    for marker in ('-', ')', '('):
        residues = residues.replace(marker, '')

    # masses generated for the cleaned sequence
    generated = gen_spectra.gen_spectrum(residues)['spectrum']

    # both lists must be ascending for the in-step walk below
    measured = sorted(observed.spectrum)
    theoretical = sorted(generated)

    measured_count = len(measured)
    theoretical_count = len(theoretical)

    # m walks the observed masses, t walks the generated masses
    m = 0
    t = 0
    error_sum = 0

    while m < measured_count and t < theoretical_count:
        theo_mass = theoretical[t]
        obs_mass = measured[m]

        # width (in Da) of the match window for this generated mass
        window = ppm_to_da(theo_mass, tolerance)

        # generated mass is below the window: try the next generated mass
        if theo_mass < obs_mass - window:
            t += 1
            continue

        # generated mass is above the window: try the next observed mass
        if theo_mass > obs_mass + window:
            m += 1
            continue

        # inside the window: record the error and consume both masses
        error_sum += abs(theo_mass - obs_mass)
        m += 1
        t += 1

    return error_sum
示例#2
0
 def test_ppm_to_da(self):
     '''ppm_to_da should report .002 for a tolerance of 20 ppm on a mass of 100.'''
     mass = 100
     tol = 20
     # the result is a float, so compare to a fixed number of decimal places
     # instead of exact equality; the outcome then does not depend on the
     # order of the multiplication/division inside ppm_to_da
     self.assertAlmostEqual(utils.ppm_to_da(mass, tol), .002, places=12,
                            msg='20 ppm of 100 should be .002')
 def boundaries(mass):
     '''Return the [lower, upper] tolerance window around *mass*.

     The half-width comes from ppm_to_da using *ppm_tolerance*, which is 
     read from the enclosing scope.
     '''
     half_width = ppm_to_da(mass, ppm_tolerance)
     lower = mass - half_width
     upper = mass + half_width
     return [lower, upper]
 def make_boundaries(mz):
     '''Return the two-element list [mz - window, mz + window].

     The window is whatever ppm_to_da yields for *mz* with *ppm_tol*, 
     a name taken from the enclosing scope.
     '''
     window = ppm_to_da(mz, ppm_tol)
     low, high = mz - window, mz + window
     return [low, high]
示例#5
0
def _score_ion_matches(observed_masses, ion_ranges, split, lesser_point, greater_point):
    '''Merge-search ascending observed masses against ascending (lower, upper) 
    ion ranges and total the points for every observed mass that falls in a range.

    :param observed_masses: observed masses in ascending order
    :type observed_masses: list
    :param ion_ranges: (lower, upper) bounds per theoretical ion, ascending
    :type ion_ranges: list
    :param split: range index at and beyond which a match earns *greater_point*
    :type split: int
    :param lesser_point: point for a match in a range with index below *split*
    :type lesser_point: float
    :param greater_point: point for a match in a range with index at or above *split*
    :type greater_point: float

    :returns: the accumulated points
    :rtype: float
    '''
    points = 0
    range_i = 0
    observed_i = 0

    while range_i < len(ion_ranges) and observed_i < len(observed_masses):
        lower, upper = ion_ranges[range_i]
        mass = observed_masses[observed_i]

        # observed is larger than the range, move to the next range
        if mass > upper:
            range_i += 1

        # observed is smaller than the range, move to the next observed mass
        elif mass < lower:
            observed_i += 1

        # observed is inside the range: award the point and advance observed only,
        # so several observed masses may score against the same range
        else:
            points += greater_point if range_i >= split else lesser_point
            observed_i += 1

    return points


def hybrid_score(
    observed: Spectrum, 
    hybrid_seq: str, 
    ppm_tolerance: int, 
    lesser_point: float = .5, 
    greater_point: float = 1.0
    ) -> float:
    '''A score for hybrid sequences. b ions found to the left of the hybrid 
    junction and y ions found to the right of the hybrid junctions will be 
    rewarded a point of value *lesser_point*. b ions found to the right of the 
    hybrid junction and y ions found to the left of hybrid junction will be 
    awarded a point of value *greater_point*.

    A sequence containing none of '-', '(' or ')' is not treated as a hybrid 
    and scores 0. The observed masses are sorted locally before matching, so 
    the caller does not need to pass an ordered spectrum.

    :param observed: observed spectrum
    :type observed: Spectrum
    :param hybrid_seq: hybrid string sequence
    :type hybrid_seq: str
    :param ppm_tolerance: mass error allowed in parts per million when matching masses 
    :type ppm_tolerance: int
    :param lesser_point: point awarded to ions found on their respective side of 
        the hybrid junction. 
        (default is .5)
    :type lesser_point: float
    :param greater_point: point awarded to ions found on their non respective side 
        of the hybrid junction. 
        (default is 1.0)
    :type greater_point: float

    :returns: the score 
    :rtype: float 

    :Example: 

    >>> hybrid_seq = 'ABC-DEF'
    >>> lesser_point = .5
    >>> greater_point = 1.0
    >>> # say our b ions found are A, C, E
    >>> # and y ions found are D, A
    >>> # our scoring then works like
    >>> # .5(bA) + .5(bC) + 1(bE) + .5 (yD) + 1(yA) 
    >>> hybrid_score(spectrum, hybrid_seq, 20, lesser_point, greater_point)
    3.5
    '''

    if '-' not in hybrid_seq and '(' not in hybrid_seq and ')' not in hybrid_seq:
        return 0

    # get a non hybrid for scoring purposes
    non_hyb = hybrid_seq.replace('-', '').replace('(', '').replace(')', '')

    # get the index to the left of which b ions will only get the lesser point
    b_split = hybrid_seq.index('-') if '-' in hybrid_seq else hybrid_seq.index('(')

    # get the index to the right of which y ions will only get the lesser point
    # NOTE(review): for '-' hybrids this length includes the '-' character itself,
    # so it is one larger than the number of residues right of the junction;
    # confirm against how gen_spectrum orders its y ions before changing it
    if '-' in hybrid_seq:
        y_split = len(hybrid_seq) - hybrid_seq.index('-')
    else:
        y_split = len(hybrid_seq) - hybrid_seq.index(')')

    # generate b and y separately to be sure
    b_spec = sorted(gen_spectra.gen_spectrum(non_hyb, ion='b')['spectrum'])
    y_spec = sorted(gen_spectra.gen_spectrum(non_hyb, ion='y')['spectrum'])

    def gen_range(mass):
        # one tolerance lookup per mass gives both bounds
        da_tol = ppm_to_da(mass, ppm_tolerance)
        return (mass - da_tol, mass + da_tol)

    # convert the spectra into lists of (lower, upper) tuples
    b_ranges = [gen_range(x) for x in b_spec]
    y_ranges = [gen_range(x) for x in y_spec]

    # the merge search needs ascending observed masses
    observed_masses = sorted(observed.spectrum)

    # linearly search the observed masses twice: once against b, once against y
    score = 0
    score += _score_ion_matches(observed_masses, b_ranges, b_split, lesser_point, greater_point)
    score += _score_ion_matches(observed_masses, y_ranges, y_split, lesser_point, greater_point)

    return score
示例#6
0
def _rank_and_filter_kmers(spectrum, hits, ion):
    '''Score every k-mer in *hits* against the observed spectrum, rank them, and 
    keep the top scoring ones.

    :param spectrum: observed spectrum in question
    :type spectrum: Spectrum
    :param hits: k-mers to score (only iterated here)
    :param ion: ion type passed to gen_spectrum ('b' or 'y')
    :type ion: str

    :returns: (all scored (kmer, score) pairs sorted best first, 
        the kept k-mers, the number of leading results considered)
    :rtype: tuple
    '''
    # best score first; among equal scores, shorter k-mers first (larger 1/len)
    results = sorted(
        [
            (
                kmer, 
                mass_comparisons.optimized_compare_masses(
                    spectrum.spectrum, 
                    gen_spectra.gen_spectrum(kmer, ion=ion)
                )
            ) for kmer in hits
        ], 
        key=lambda x: (x[1], 1/len(x[0])), 
        reverse=True
    )

    # default keeps an empty hit collection from raising ValueError
    max_score = max((x[1] for x in results), default=0)

    # count the number of kmers that have the highest value
    num_max = sum(1 for x in results if x[1] == max_score)

    # if we have more than TOP_X number of the highest score, take all of them
    keep_count = max(TOP_X, num_max)

    # take the aforementioned number of results that are greater than zero
    filtered = [x[0] for x in results[:keep_count] if x[1] > 0]

    return results, filtered, keep_count


def _cut_off_score(results, keep_count):
    '''Score of the last result inside the kept window, or None if there are no results.'''
    if not results:
        return None

    # there may be fewer results than keep_count, so clamp the index
    return results[min(keep_count, len(results)) - 1][1]


def id_spectrum(
    spectrum: Spectrum, 
    db: Database,
    b_hits: dict, 
    y_hits: dict,
    ppm_tolerance: int, 
    precursor_tolerance: int, 
    n: int,
    digest_type: str = '',
    truth: dict = None, 
    fall_off: dict = None, 
    is_last: bool = False
    ) -> Alignments:
    '''Given the spectrum and initial hits, start the alignment process for 
    the input spectrum

    The b and y hits are scored against the observed spectrum, ranked, and 
    reduced to the top TOP_X (or every k-mer tied for the best score if there 
    are more than TOP_X of them), dropping any with a score of zero. Empty 
    hit collections are tolerated and simply produce empty filtered lists.

    :param spectrum: observed spectrum in question
    :type spectrum: Spectrum
    :param db: Holds all the source sequences
    :type db: Database
    :param b_hits: all k-mers found from the b-ion search (only iterated here)
    :type b_hits: dict
    :param y_hits: all k-mers found from the y-ion search (only iterated here)
    :type y_hits: dict
    :param ppm_tolerance: the parts per million error allowed when trying to match masses
    :type ppm_tolerance: int
    :param precursor_tolerance: the parts per million error allowed when trying to match
        precursor masses
    :type precursor_tolerance: int
    :param n: the number of alignments to save
    :type n: int
    :param digest_type: the digest performed on the sample. Not used in this function.
        (default is '')
    :type digest_type: str
    :param truth: a set of id keyed spectra with the desired spectra. A better description of what this looks like can be 
        seen in the param.py file. If left None, the program will continue normally
        (default is None)
    :type truth: dict
    :param fall_off: only works if the truth param is set to a dictionary. This is a dictionary (if using multiprocessing, 
        needs to be process safe) where, if a sequence loses the desired sequence, a key value pair of spectrum id, 
        DevFallOffEntry object are added to it. 
        (default is None)
    :type fall_off: dict
    :param is_last: Only works if DEV is set to true in params. If set to true, timing evaluations are done. 
        (default is False)
    :type is_last: bool

    :returns: Alignments for the spectrum. If no alignment can be created, an empty Alignments object is inserted
    :rtype: Alignments
    '''

    # convert the ppm tolerance of the precursor with ppm_to_da once, and use
    # the converted value for the rest of the time
    precursor_tolerance = utils.ppm_to_da(spectrum.precursor_mass, precursor_tolerance)

    # score, sort and filter the hits for each ion type
    b_results, filtered_b, keep_b_count = _rank_and_filter_kmers(spectrum, b_hits, 'b')
    y_results, filtered_y, keep_y_count = _rank_and_filter_kmers(spectrum, y_hits, 'y')

    # if fall off and truth are not none, check to see that we can still make the truth seq
    if truth is not None and fall_off is not None:

        # pull out id, hybrid, and truth seq to make it easier
        _id = spectrum.id
        truth_seq = truth[_id]['sequence']
        is_hybrid = truth[_id]['hybrid']

        if not utils.DEV_contains_truth_parts(truth_seq, is_hybrid, filtered_b, filtered_y):

            # add some metadata about what we kept and what fell off
            metadata = {
                'top_x_b_hits': filtered_b, 
                'top_x_y_hits': filtered_y, 
                'excluded_b_hits': [x[0] for x in b_results[keep_b_count:]],
                'excluded_y_hits': [x[0] for x in y_results[keep_y_count:]], 
                'cut_off_b_score': _cut_off_score(b_results, keep_b_count), 
                'cut_off_y_score': _cut_off_score(y_results, keep_y_count)
            }

            # make dev fall off object and add to fall off
            fall_off[_id] = DEVFallOffEntry(
                is_hybrid, 
                truth_seq, 
                'top_x_filtering', 
                metadata
            )

            # skip this entry all together
            return Alignments(spectrum, [])

    # create an alignment for the spectrum
    return alignment.attempt_alignment(
        spectrum, 
        db, 
        filtered_b, 
        filtered_y, 
        ppm_tolerance=ppm_tolerance, 
        precursor_tolerance=precursor_tolerance,
        n=n, 
        truth=truth, 
        fall_off=fall_off, 
        is_last=is_last
    )