def total_mass_error(observed: Spectrum, alignment: str, tolerance: int) -> float:
    '''Sum the absolute mass errors over every mass matched between the
    observed spectrum and the spectrum generated from the alignment string.

    Both mass lists are sorted and walked in lock step. A pair counts as a
    match when the two masses differ by no more than the tolerance, which is
    converted with ``ppm_to_da`` from the generated (alignment) mass.

    :param observed: observed spectrum
    :type observed: Spectrum
    :param alignment: the string alignment; the hybrid markers ``-``, ``(``
        and ``)`` are stripped before the spectrum is generated
    :type alignment: str
    :param tolerance: parts per million tolerance allowed when matching masses
    :type tolerance: int

    :returns: sum of the absolute values of all mass errors
    :rtype: float
    '''
    # strip the hybrid markers so only the plain sequence is left
    plain_sequence = alignment
    for marker in ('-', ')', '('):
        plain_sequence = plain_sequence.replace(marker, '')

    generated = gen_spectra.gen_spectrum(plain_sequence)['spectrum']

    # both lists must be ascending for the lock step walk below
    measured_masses = sorted(observed.spectrum)
    generated_masses = sorted(generated)
    num_measured = len(measured_masses)
    num_generated = len(generated_masses)

    measured_idx = 0
    generated_idx = 0
    error_sum = 0

    while measured_idx < num_measured and generated_idx < num_generated:
        generated_mass = generated_masses[generated_idx]
        measured_mass = measured_masses[measured_idx]

        # the matching window is derived from the generated mass
        window = ppm_to_da(generated_mass, tolerance)

        # generated mass is below the window: move to the next generated mass
        if generated_mass < measured_mass - window:
            generated_idx += 1
            continue

        # generated mass is above the window: move to the next measured mass
        if generated_mass > measured_mass + window:
            measured_idx += 1
            continue

        # inside the window: record the error and consume both masses
        error_sum += abs(generated_mass - measured_mass)
        measured_idx += 1
        generated_idx += 1

    return error_sum
def test_ppm_to_da(self):
    '''utils.ppm_to_da(100, 20) should give .002 (20 parts per million of 100).'''
    mass = 100
    tol = 20
    # The conversion is floating point arithmetic, so compare approximately
    # (7 decimal places by default) instead of requiring bit-exact equality.
    self.assertAlmostEqual(
        utils.ppm_to_da(mass, tol),
        .002,
        msg='20 ppm of 100 should be .002'
    )
def boundaries(mass):
    '''Return ``[lower, upper]`` bounds around *mass*.

    The half width comes from ``ppm_to_da(mass, ppm_tolerance)``;
    ``ppm_tolerance`` is not a parameter and is read from the surrounding
    scope.
    '''
    half_width = ppm_to_da(mass, ppm_tolerance)
    lower = mass - half_width
    upper = mass + half_width
    return [lower, upper]
def make_boundaries(mz):
    '''Return ``[lower, upper]`` bounds around *mz*.

    The half width comes from ``ppm_to_da(mz, ppm_tol)``; ``ppm_tol`` is not
    a parameter and is read from the surrounding scope.
    '''
    half_width = ppm_to_da(mz, ppm_tol)
    lower_bound = mz - half_width
    upper_bound = mz + half_width
    return [lower_bound, upper_bound]
def _score_ion_matches(ranges, observed_masses, split, lesser_point, greater_point):
    '''Walk sorted (lower, upper) ranges and sorted observed masses together and sum the points of every observed mass that falls inside a range.'''
    total = 0
    range_i = 0
    observed_i = 0

    while range_i < len(ranges) and observed_i < len(observed_masses):
        lower, upper = ranges[range_i]
        mass = observed_masses[observed_i]

        # observed is larger than the range, move to the next range
        if mass > upper:
            range_i += 1

        # observed is smaller than the range, move to the next observed mass
        elif mass < lower:
            observed_i += 1

        # in range: ranges at or past the split index earn the greater point.
        # Only the observed index advances, so several observed masses may
        # match the same range.
        else:
            total += greater_point if range_i >= split else lesser_point
            observed_i += 1

    return total


def hybrid_score(
    observed: Spectrum,
    hybrid_seq: str,
    ppm_tolerance: int,
    lesser_point: float = .5,
    greater_point: float = 1.0
) -> float:
    '''A score for hybrid sequences. b ions found to the left of the hybrid
    junction and y ions found to the right of the hybrid junction will be
    rewarded a point of value *lesser_point*. b ions found to the right of the
    hybrid junction and y ions found to the left of the hybrid junction will be
    awarded a point of value *greater_point*.

    A sequence without any hybrid marker (``-``, ``(`` or ``)``) scores 0.
    The junction is located with ``-`` when present, otherwise with ``(`` for
    b ions and ``)`` for y ions; a sequence that has only one of the two
    parentheses raises ``ValueError`` from ``str.index``.

    :param observed: observed spectrum
    :type observed: Spectrum
    :param hybrid_seq: hybrid string sequence
    :type hybrid_seq: str
    :param ppm_tolerance: mass error allowed in parts per million when matching masses
    :type ppm_tolerance: int
    :param lesser_point: point awarded to ions found on their respective side
        of the hybrid junction.
        (default is .5)
    :type lesser_point: float
    :param greater_point: point awarded to ions found on their non respective
        side of the hybrid junction.
        (default is 1.0)
    :type greater_point: float

    :returns: the score
    :rtype: float

    :Example:

    >>> hybrid_seq = 'ABC-DEF'
    >>> lesser_point = .5
    >>> greater_point = 1.0
    >>> # say our b ions found are A, C, E
    >>> # and y ions found are D, A
    >>> # our scoring then works like
    >>> # .5(bA) + .5(bC) + 1(bE) + .5 (yD) + 1(yA)
    >>> hybrid_score(spectrum, hybrid_seq, 20, lesser_point, greater_point)
    3.5
    '''
    if '-' not in hybrid_seq and '(' not in hybrid_seq and ')' not in hybrid_seq:
        return 0

    # get a non hybrid for scoring purposes
    non_hyb = hybrid_seq.replace('-', '').replace('(', '').replace(')', '')

    # index into the sorted b masses before which b ions only get the lesser point
    if '-' in hybrid_seq:
        b_split = hybrid_seq.index('-')
    else:
        b_split = hybrid_seq.index('(')

    # index into the sorted y masses before which y ions only get the lesser point
    # NOTE(review): for 'ABC-DEF' this evaluates to 7 - 3 = 4 although only 3
    # residues sit to the right of the junction; kept as is, confirm whether
    # the extra position is intended.
    if '-' in hybrid_seq:
        y_split = len(hybrid_seq) - hybrid_seq.index('-')
    else:
        y_split = len(hybrid_seq) - hybrid_seq.index(')')

    # generate b and y separately to be sure
    b_spec = sorted(gen_spectra.gen_spectrum(non_hyb, ion='b')['spectrum'])
    y_spec = sorted(gen_spectra.gen_spectrum(non_hyb, ion='y')['spectrum'])

    # convert the spectra into lists of (lower, upper) tuples
    def gen_range(mass):
        da_tol = ppm_to_da(mass, ppm_tolerance)
        return (mass - da_tol, mass + da_tol)

    b_ranges = [gen_range(x) for x in b_spec]
    y_ranges = [gen_range(x) for x in y_spec]

    # the merge search below is only correct on ascending masses, so sort the
    # observed masses first (a no-op for an already sorted spectrum)
    observed_masses = sorted(observed.spectrum)

    # do a merge search where we linearly search each mass in the observed twice,
    # once against the b ranges and once against the y ranges
    score = 0
    score += _score_ion_matches(b_ranges, observed_masses, b_split, lesser_point, greater_point)
    score += _score_ion_matches(y_ranges, observed_masses, y_split, lesser_point, greater_point)

    return score
def _score_and_rank_kmers(spectrum, kmers, ion):
    '''Score each k-mer against the observed spectrum and return (kmer, score) pairs, best first.'''
    return sorted(
        [
            (
                kmer,
                mass_comparisons.optimized_compare_masses(
                    spectrum.spectrum,
                    gen_spectra.gen_spectrum(kmer, ion=ion)
                )
            )
            for kmer in kmers
        ],
        # highest score first; among equal scores the shorter k-mer
        # (larger 1/len) comes first
        key=lambda x: (x[1], 1/len(x[0])),
        reverse=True
    )


def _top_x_count(results):
    '''Return how many ranked results to keep: TOP_X, or the number tied for the best score if that is larger.'''
    # default=0 keeps an empty result list from raising ValueError
    best_score = max((score for _, score in results), default=0)
    num_best = sum(1 for _, score in results if score == best_score)
    return max(TOP_X, num_best)


def _cut_off_score(results, keep_count):
    '''Return the score of the last result inside the keep window, or None when there are no results.'''
    kept = results[:keep_count]
    return kept[-1][1] if kept else None


def id_spectrum(
    spectrum: Spectrum,
    db: Database,
    b_hits: dict,
    y_hits: dict,
    ppm_tolerance: int,
    precursor_tolerance: int,
    n: int,
    digest_type: str = '',
    truth: dict = None,
    fall_off: dict = None,
    is_last: bool = False
) -> Alignments:
    '''Given the spectrum and initial hits, start the alignment process for
    the input spectrum

    The hits are scored against the observed spectrum, ranked, and cut down
    to the top TOP_X (or all hits tied for the best score, if more) with a
    score greater than zero before alignment is attempted.

    :param spectrum: observed spectrum in question
    :type spectrum: Spectrum
    :param db: Holds all the source sequences
    :type db: Database
    :param b_hits: all k-mers found from the b-ion search. Only iterated
        here, so any iterable of k-mer strings works.
    :type b_hits: dict
    :param y_hits: all k-mers found from the y-ion search. Only iterated
        here, so any iterable of k-mer strings works.
    :type y_hits: dict
    :param ppm_tolerance: the parts per million error allowed when trying to match masses
    :type ppm_tolerance: int
    :param precursor_tolerance: the parts per million error allowed when trying to match
        precursor masses
    :type precursor_tolerance: int
    :param n: the number of alignments to save
    :type n: int
    :param digest_type: the digest performed on the sample. Not used by this
        function.
        (default is '')
    :type digest_type: str
    :param truth: a set of id keyed spectra with the desired spectra. A better description of what
        this looks like can be seen in the param.py file. If left None, the program will continue
        normally
        (default is None)
    :type truth: dict
    :param fall_off: only works if the truth param is set to a dictionary. This is a dictionary (if
        using multiprocessing, needs to be process safe) where, if a sequence loses the desired
        sequence, a key value pair of spectrum id, DevFallOffEntry object are added to it.
        (default is None)
    :type fall_off: dict
    :param is_last: Only works if DEV is set to true in params. If set to true, timing evaluations
        are done.
        (default is False)
    :type is_last: bool

    :returns: Alignments for the spectrum. If no alignment can be created, an empty Alignments
        object is inserted
    :rtype: Alignments
    '''
    # convert the precursor tolerance from ppm with utils.ppm_to_da for the
    # rest of the time
    precursor_tolerance = utils.ppm_to_da(spectrum.precursor_mass, precursor_tolerance)

    # score and sort these results
    b_results = _score_and_rank_kmers(spectrum, b_hits, 'b')
    y_results = _score_and_rank_kmers(spectrum, y_hits, 'y')

    # filter out the results
    # 1. take all non-zero values
    # 2. either take the TOP_X or if > TOP_X have the same score, all of those values
    keep_b_count = _top_x_count(b_results)
    keep_y_count = _top_x_count(y_results)

    # take the aforementioned number of results that are > than zero
    filtered_b = [kmer for kmer, score in b_results[:keep_b_count] if score > 0]
    filtered_y = [kmer for kmer, score in y_results[:keep_y_count] if score > 0]

    # if fall off and truth are not none, check to see that we can still make the truth seq
    if truth is not None and fall_off is not None:

        # pull out id, hybrid, and truth seq to make it easier
        _id = spectrum.id
        truth_seq = truth[_id]['sequence']
        is_hybrid = truth[_id]['hybrid']

        if not utils.DEV_contains_truth_parts(truth_seq, is_hybrid, filtered_b, filtered_y):

            # add some metadata about what we kept and what fell off
            metadata = {
                'top_x_b_hits': filtered_b,
                'top_x_y_hits': filtered_y,
                'excluded_b_hits': [x[0] for x in b_results[keep_b_count:]],
                'excluded_y_hits': [x[0] for x in y_results[keep_y_count:]],
                # fewer results than the keep count used to raise IndexError
                # here; the last available score (or None) is reported instead
                'cut_off_b_score': _cut_off_score(b_results, keep_b_count),
                'cut_off_y_score': _cut_off_score(y_results, keep_y_count)
            }

            # make dev fall off object and add to fall off
            fall_off[_id] = DEVFallOffEntry(
                is_hybrid,
                truth_seq,
                'top_x_filtering',
                metadata
            )

            # skip this entry all together
            return Alignments(spectrum, [])

    # create an alignment for the spectrum
    return alignment.attempt_alignment(
        spectrum,
        db,
        filtered_b,
        filtered_y,
        ppm_tolerance=ppm_tolerance,
        precursor_tolerance=precursor_tolerance,
        n=n,
        truth=truth,
        fall_off=fall_off,
        is_last=is_last
    )