def hamming_distance_to_true_naive(self, true_line, line, query_name, restrict_to_region='', normalize=False, padfo=None, debug=False):
    """
    Hamming distance between the inferred naive sequence and the true naive sequence.

    <true_line>, <line>: annotations for the true and inferred rearrangements. Both are handed to
        utils.get_full_naive_seq() together with self.germlines, and both must have a 'seq' entry.
    <query_name>: only used to label the exception raised on a length mismatch.
    <restrict_to_region>: if set, restrict the comparison to the section of the *true* sequence assigned to the
        given region. NOTE this will not in general correspond to the similarly-assigned region in the
        inferred naive sequence.
    <normalize>: if set, return the distance as an integer percentage of the compared length (rounded down).
    <padfo>: if not None, must have 'padleft' and 'padright' entries: that many characters are stripped
        from the left and right ends of the inferred naive sequence before comparing.
    <debug>: print the sequences that are being compared.

    Returns an int (0, after printing a warning, if the compared sequence is empty).
    Raises Exception if the two sequences still differ in length after all adjustments, and
    AssertionError if the inferred 'seq' is shorter than, but not contained in, the true 'seq'.
    """
    true_naive_seq = utils.get_full_naive_seq(self.germlines, true_line)
    inferred_naive_seq = utils.get_full_naive_seq(self.germlines, line)

    true_seq, inferred_seq = true_line['seq'], line['seq']
    if len(true_seq) > len(inferred_seq):  # ihhhmmm doesn't report the bits of the sequence it erodes off the ends, so we have to add them back on
        start = true_seq.find(inferred_seq)
        if start < 0:  # explicit raise rather than assert, so the check survives python -O
            raise AssertionError('inferred seq for %s is not a substring of the true seq' % query_name)
        n_left = start
        n_right = len(true_seq) - (start + len(inferred_seq))
        # pad with ambiguous bases so that positions line up with the true naive sequence
        inferred_naive_seq = 'N' * n_left + inferred_naive_seq + 'N' * n_right
        if debug:
            print(' adding to inferred naive seq')

    if padfo is not None:  # remove N padding from the inferred sequence
        inferred_naive_seq = inferred_naive_seq[padfo['padleft'] : ]
        if padfo['padright'] > 0:  # a [:-0] slice would throw away the whole sequence
            inferred_naive_seq = inferred_naive_seq[ : -padfo['padright']]

    bounds = None
    if restrict_to_region != '':
        bounds = utils.get_regional_naive_seq_bounds(restrict_to_region, self.germlines, true_line)  # get the bounds of this *true* region
        true_naive_seq = true_naive_seq[bounds[0] : bounds[1]]
        inferred_naive_seq = inferred_naive_seq[bounds[0] : bounds[1]]

    if debug:
        # single-argument print() calls give the same output as the py2 print statement
        print(' '.join([restrict_to_region, 'region, bounds', str(bounds)]))
        print('%s %s' % (' true ', true_naive_seq))
        print('%s %s' % (' infer', inferred_naive_seq))

    if len(true_naive_seq) != len(inferred_naive_seq):
        raise Exception('still not the same lengths for %s\n %s\n %s' % (query_name, true_naive_seq, inferred_naive_seq))

    # bail out before touching the hamming helper, so an empty comparison can never reach a division by zero
    if len(true_naive_seq) == 0:
        print('WARNING zero length sequence in hamming_distance_to_true_naive')
        return 0

    fraction, len_excluding_ambig = utils.hamming_fraction(true_naive_seq, inferred_naive_seq, return_len_excluding_ambig=True)
    # NOTE(review): presumably fraction == n_mismatches / len_excluding_ambig -- confirm against utils.
    # round() rather than a bare int(): the float product can land just below the integer
    # (e.g. 57 / 100.0 * 100 == 56.99999999999999), and truncating would then lose a mismatch.
    total_distance = int(round(fraction * len_excluding_ambig))

    if normalize:
        # exact integer arithmetic: same rounding-down as before, but without float error
        return 100 * total_distance // len(true_naive_seq)
    return total_distance
def hamming_distance_to_true_naive(self, true_line, line, query_name, restrict_to_region='', normalize=False, debug=False):
    """
    Hamming distance between the inferred naive sequence and the true naive sequence.

    <true_line>, <line>: annotations for the true and inferred rearrangements. Both are handed to
        utils.get_full_naive_seq() together with self.germlines, and both must have a 'seq' entry.
    <query_name>: only used to label the error printed on a length mismatch.
    <restrict_to_region>: if set, restrict the comparison to the section of the *true* sequence assigned to the
        given region. NOTE this will not in general correspond to the similarly-assigned region in the
        inferred naive sequence.
    <normalize>: if set, return the distance as an integer percentage of the compared length (rounded down).
    <debug>: print the sequences that are being compared.

    Returns an int (0, after printing a warning, if the compared sequence is empty).
    If the two sequences still differ in length after all adjustments, prints an error and exits the
    process with a non-zero status. Raises AssertionError if the inferred 'seq' is shorter than, but not
    contained in, the true 'seq'.
    """
    true_naive_seq = utils.get_full_naive_seq(self.germlines, true_line)
    inferred_naive_seq = utils.get_full_naive_seq(self.germlines, line)

    true_seq, inferred_seq = true_line['seq'], line['seq']
    if len(true_seq) > len(inferred_seq):  # ihhhmmm doesn't report the bits of the sequence it erodes off the ends, so we have to add them back on
        start = true_seq.find(inferred_seq)
        if start < 0:  # explicit raise rather than assert, so the check survives python -O
            raise AssertionError('inferred seq for %s is not a substring of the true seq' % query_name)
        n_left = start
        n_right = len(true_seq) - (start + len(inferred_seq))
        # pad with a placeholder character so that positions line up with the true naive sequence
        inferred_naive_seq = 'x' * n_left + inferred_naive_seq + 'x' * n_right
        if debug:
            print(' adding to inferred naive seq')

    bounds = None
    if restrict_to_region != '':
        bounds = utils.get_regional_naive_seq_bounds(restrict_to_region, self.germlines, true_line)  # get the bounds of this *true* region
        true_naive_seq = true_naive_seq[bounds[0] : bounds[1]]
        inferred_naive_seq = inferred_naive_seq[bounds[0] : bounds[1]]

    if debug:
        # single-argument print() calls give the same output as the py2 print statement
        print(' '.join([restrict_to_region, 'region, bounds', str(bounds)]))
        print('%s %s' % (' true ', true_naive_seq))
        print('%s %s' % (' infer', inferred_naive_seq))

    if len(true_naive_seq) != len(inferred_naive_seq):
        print('ERROR still not the same lengths for %s' % query_name)
        print('%s %s' % (' true ', true_naive_seq))
        print('%s %s' % (' infer', inferred_naive_seq))
        sys.exit(1)  # a bare sys.exit() would report *success* to the calling shell

    # bail out before touching the hamming helper: nothing to compare
    if len(true_naive_seq) == 0:
        print('WARNING zero length sequence in hamming_distance_to_true_naive')
        return 0

    total_distance = utils.hamming(true_naive_seq, inferred_naive_seq)  # presumably an integer mismatch count -- confirm against utils

    if normalize:
        # floor division avoids float error (100 * (57 / 100.0) == 56.99999999999999 would truncate to 56)
        return int(100 * total_distance // len(true_naive_seq))
    return total_distance
def resolve_overlapping_matches(line, debug=False, germlines=None):
    """
    joinsolver allows d and j matches (and v and d matches) to overlap... which makes no sense, so
    arbitrarily split the disputed territory in two.

    <line>: annotation dict, modified in place. Reads 'seq' and, for each of v/d/j, the '<region>_qr_seq'
        and '<region>_gl_seq' entries; when an overlap is found, trims those and increments the left-hand
        region's '<region>_3p_del' and the right-hand region's '<region>_5p_del'. 'unique_id' is only
        read for debug output.
    <debug>: print warnings about repeated matches and about any apportioned overlap.
    <germlines>: passed through to utils.get_full_naive_seq() for the final consistency check.

    Raises AssertionError if a query sequence is not found in line['seq'], if an overlap portion is longer
    than the germline match it has to be removed from, or if the resulting naive sequence is not the same
    length as line['seq'].
    """
    # NOTE this does pretty much the same thing as a function in waterer
    seq = line['seq']
    for left, right in (('v', 'd'), ('d', 'j')):
        l_qr_seq = line[left + '_qr_seq']
        r_qr_seq = line[right + '_qr_seq']
        n_l_matches = len(re.findall(l_qr_seq, seq))
        n_r_matches = len(re.findall(r_qr_seq, seq))

        # start from the *first* occurrence of each: (base after the last left-matched base) - (first right-matched base)
        overlap = seq.find(l_qr_seq) + len(l_qr_seq) - seq.find(r_qr_seq)

        if n_l_matches > 1 or n_r_matches > 1:
            # darn, the match occurs more than once, so we have to figure out which one to use.
            # Choose the combination that gives the smallest overlap.
            # NOTE(review): "smallest" is the signed minimum, so a large gap beats a small overlap -- confirm that is intended
            lpos = -1
            for _ in range(n_l_matches):
                lpos = seq.find(l_qr_seq, lpos + 1)
                if lpos < 0:  # ran out of literal occurrences: -1 is not a position
                    break
                rpos = -1  # restart the right-hand scan for *each* left-hand position, otherwise most combinations are never tried
                for _ in range(n_r_matches):
                    rpos = seq.find(r_qr_seq, rpos + 1)
                    if rpos < 0:
                        break
                    overlap = min(overlap, lpos + len(l_qr_seq) - rpos)
            if debug:
                if n_l_matches > 1:
                    print(' WARNING %d occurences of %s in %s' % (n_l_matches, l_qr_seq, seq))
                if n_r_matches > 1:
                    print(' WARNING %d occurences of %s in %s' % (n_r_matches, r_qr_seq, seq))
        elif n_l_matches != 1 or n_r_matches != 1:  # i.e. at least one of them was not found at all
            if debug:
                print(' all_l_matches %d all_r_matches %d' % (n_l_matches, n_r_matches))
            raise AssertionError('expected exactly one %s and one %s match, but got %d and %d' % (left, right, n_l_matches, n_r_matches))

        if overlap > 0:
            # odd overlaps give the extra base to the right-hand match
            lefthand_portion = overlap // 2
            righthand_portion = overlap - lefthand_portion
            if debug:
                print(' WARNING %s apportioning %d overlapping bases between %s (%d) and %s (%d) matches' % (line['unique_id'], overlap, left, lefthand_portion, right, righthand_portion))
            if lefthand_portion > len(line[left + '_gl_seq']):
                raise AssertionError('%s portion of the overlap is longer than the %s germline match' % (left, left))
            if righthand_portion > len(line[right + '_gl_seq']):
                raise AssertionError('%s portion of the overlap is longer than the %s germline match' % (right, right))

            if lefthand_portion > 0:  # slicing doesn't 'work' with zeros ([:-0] is empty)
                line[left + '_gl_seq'] = line[left + '_gl_seq'][:-lefthand_portion]
                line[left + '_qr_seq'] = l_qr_seq[:-lefthand_portion]
            line[left + '_3p_del'] += lefthand_portion

            line[right + '_gl_seq'] = line[right + '_gl_seq'][righthand_portion:]
            line[right + '_qr_seq'] = r_qr_seq[righthand_portion:]
            line[right + '_5p_del'] += righthand_portion
        elif debug:
            print(' no %s overlap, not doing anything' % (left + right))

    naive_seq = utils.get_full_naive_seq(germlines, line)
    muted_seq = line['seq']
    if len(naive_seq) != len(muted_seq):
        # this will happen if, for instance, there's a really short D match and there's no way to
        # figure out where it was really supposed to be
        if debug:
            print(' unequal lengths:')
            print('%s %s' % (' ', naive_seq))
            print('%s %s' % (' ', muted_seq))
        raise AssertionError('naive sequence length %d differs from sequence length %d' % (len(naive_seq), len(muted_seq)))
def hamming_distance_to_true_naive(self, true_line, line, query_name, restrict_to_region='', normalize=False, padfo=None, debug=False):
    """
    Hamming distance between the inferred naive sequence and the true naive sequence.

    <true_line>, <line>: annotations for the true and inferred rearrangements. Both are handed to
        utils.get_full_naive_seq() together with self.germlines; line['unique_id'] is used in error messages.
    <query_name>: only used to label the exception raised on the second length check.
    <restrict_to_region>: if set, restrict the comparison to the section of the *true* sequence assigned to the
        given region. NOTE this will not in general correspond to the similarly-assigned region in the
        inferred naive sequence.
    <normalize>: if set, return the distance as an integer percentage of the compared length (rounded down).
    <padfo>: if not None, must have 'padleft' and 'padright' entries, used to try to strip N padding from
        the ends of the inferred naive sequence.
    <debug>: print the intermediate sequences.

    Returns an int (0, after printing a warning, if the compared sequence is empty).
    Raises Exception if the true and inferred naive sequences differ in length, either as returned by
    utils or after padding removal and region restriction.
    """
    true_naive_seq = utils.get_full_naive_seq(self.germlines, true_line)
    inferred_naive_seq = utils.get_full_naive_seq(self.germlines, line)

    if len(true_naive_seq) != len(inferred_naive_seq):
        # dump both annotations side by side before giving up
        # (single-argument print() calls give the same output as the py2 print statement)
        print('%20s true inf' % '')
        for k in true_line:
            true_column = '%20s %s' % (k, true_line[k])
            inferred_column = ' %s' % line[k] if k in line else ' NOPE'
            print('%s %s' % (true_column, inferred_column))
        for k in line:
            if k not in true_line:
                print(' not in true line %20s %s' % (k, line[k]))
        raise Exception('%s true and inferred sequences not the same length\n %s\n %s\n' % (line['unique_id'], true_naive_seq, inferred_naive_seq))

    if padfo is not None:  # remove N padding from the inferred sequence
        if debug:
            print('removing padfo')
            print(inferred_naive_seq)
        # NOTE(review): this counts the Ns in what would be *left over* after stripping, not in the
        # <padleft> characters that are about to be stripped -- [:padleft] looks like what was meant.
        # Left as is, since stripping for real would trip the length check below; confirm the intent.
        if inferred_naive_seq[padfo['padleft'] : ].count('N') == padfo['padleft']:  # this fails to happen if reset_effective_erosions_and_effective_insertions already removed the Ns
            inferred_naive_seq = inferred_naive_seq[padfo['padleft'] : ]
        elif debug:  # NOTE if no debug, we just fall through, which is ok
            print('tried to remove non Ns!\n %s\n padleft %d\n' % (inferred_naive_seq, padfo['padleft']))
        if padfo['padright'] > 0:
            # NOTE(review): same concern -- this inspects the *first* <padright> characters, whereas the
            # characters that get stripped are the last ones ([-padright:]); confirm the intent.
            if inferred_naive_seq[ : padfo['padright']].count('N') == padfo['padright']:  # this fails to happen if reset_effective_erosions_and_effective_insertions already removed the Ns
                inferred_naive_seq = inferred_naive_seq[ : -padfo['padright']]
            elif debug:  # NOTE if no debug, we just fall through, which is ok
                print('tried to remove non Ns!\n %s\n padright %d\n' % (inferred_naive_seq, padfo['padright']))
        if debug:
            print(padfo['padleft'] * ' ' + inferred_naive_seq + padfo['padleft'] * ' ')

    if restrict_to_region != '':
        bounds = utils.get_regional_naive_seq_bounds(restrict_to_region, self.germlines, true_line)  # get the bounds of this *true* region
        if debug:
            print('restrict to %s' % restrict_to_region)
            utils.color_mutants(true_naive_seq, inferred_naive_seq, print_result=True, extra_str=' ')
            utils.color_mutants(true_naive_seq[bounds[0] : bounds[1]], inferred_naive_seq[bounds[0] : bounds[1]], print_result=True, extra_str=' ' + bounds[0]*' ')
        true_naive_seq = true_naive_seq[bounds[0] : bounds[1]]
        inferred_naive_seq = inferred_naive_seq[bounds[0] : bounds[1]]

    if len(true_naive_seq) != len(inferred_naive_seq):
        raise Exception('still not the same lengths for %s\n %s\n %s' % (query_name, true_naive_seq, inferred_naive_seq))

    # bail out before touching the hamming helper, so an empty comparison can never reach a division by zero
    if len(true_naive_seq) == 0:
        print('WARNING zero length sequence in hamming_distance_to_true_naive')
        return 0

    fraction, len_excluding_ambig = utils.hamming_fraction(true_naive_seq, inferred_naive_seq, return_len_excluding_ambig=True)
    # NOTE(review): presumably fraction == n_mismatches / len_excluding_ambig -- confirm against utils.
    # round() rather than a bare int(): the float product can land just below the integer
    # (e.g. 57 / 100.0 * 100 == 56.99999999999999), and truncating would then lose a mismatch.
    total_distance = int(round(fraction * len_excluding_ambig))

    if normalize:
        # exact integer arithmetic: same rounding-down as before, but without float error
        return 100 * total_distance // len(true_naive_seq)
    return total_distance
def resolve_overlapping_matches(line, debug=False, germlines=None):
    """
    joinsolver allows d and j matches (and v and d matches) to overlap... which makes no sense, so
    arbitrarily split the disputed territory in two.

    <line>: annotation dict, modified in place. Reads 'seq' and, for each of v/d/j, the '<region>_qr_seq'
        and '<region>_gl_seq' entries; when an overlap is found, trims those and increments the left-hand
        region's '<region>_3p_del' and the right-hand region's '<region>_5p_del'. 'unique_id' is only
        read for debug output.
    <debug>: print warnings about repeated matches and about any apportioned overlap.
    <germlines>: passed through to utils.get_full_naive_seq() for the final consistency check.

    Raises AssertionError if a query sequence is not found in line['seq'], if an overlap portion is longer
    than the germline match it has to be removed from, or if the resulting naive sequence is not the same
    length as line['seq'].
    """
    # NOTE this does pretty much the same thing as a function in waterer
    full_seq = line['seq']
    for lregion, rregion in (('v', 'd'), ('d', 'j')):
        l_qr_seq = line[lregion + '_qr_seq']
        r_qr_seq = line[rregion + '_qr_seq']
        l_count = len(re.findall(l_qr_seq, full_seq))
        r_count = len(re.findall(r_qr_seq, full_seq))

        # start from the *first* occurrence of each: (base after the last left-matched base) - (first right-matched base)
        overlap = full_seq.find(l_qr_seq) + len(l_qr_seq) - full_seq.find(r_qr_seq)

        if l_count > 1 or r_count > 1:
            # darn, the match occurs more than once, so we have to figure out which one to use.
            # Choose the combination that gives the smallest overlap.
            # NOTE(review): "smallest" is the signed minimum, so a large gap beats a small overlap -- confirm that is intended
            lpos = -1
            for _ in range(l_count):
                lpos = full_seq.find(l_qr_seq, lpos + 1)
                if lpos < 0:  # ran out of literal occurrences: -1 is not a position
                    break
                rpos = -1  # restart the right-hand scan for *each* left-hand position, otherwise most combinations are never tried
                for _ in range(r_count):
                    rpos = full_seq.find(r_qr_seq, rpos + 1)
                    if rpos < 0:
                        break
                    overlap = min(overlap, lpos + len(l_qr_seq) - rpos)
            if debug:
                # single-argument print() calls give the same output as the py2 print statement
                if l_count > 1:
                    print(' WARNING %d occurences of %s in %s' % (l_count, l_qr_seq, full_seq))
                if r_count > 1:
                    print(' WARNING %d occurences of %s in %s' % (r_count, r_qr_seq, full_seq))
        elif l_count != 1 or r_count != 1:  # i.e. at least one of them was not found at all
            if debug:
                print(' all_l_matches %d all_r_matches %d' % (l_count, r_count))
            raise AssertionError('expected exactly one %s and one %s match, but got %d and %d' % (lregion, rregion, l_count, r_count))

        if overlap > 0:
            # odd overlaps give the extra base to the right-hand match
            lefthand_portion = overlap // 2
            righthand_portion = overlap - lefthand_portion
            if debug:
                print(' WARNING %s apportioning %d overlapping bases between %s (%d) and %s (%d) matches' % (line['unique_id'], overlap, lregion, lefthand_portion, rregion, righthand_portion))
            if lefthand_portion > len(line[lregion + '_gl_seq']):
                raise AssertionError('%s portion of the overlap is longer than the %s germline match' % (lregion, lregion))
            if righthand_portion > len(line[rregion + '_gl_seq']):
                raise AssertionError('%s portion of the overlap is longer than the %s germline match' % (rregion, rregion))

            if lefthand_portion > 0:  # slicing doesn't 'work' with zeros ([:-0] is empty)
                line[lregion + '_gl_seq'] = line[lregion + '_gl_seq'][:-lefthand_portion]
                line[lregion + '_qr_seq'] = l_qr_seq[:-lefthand_portion]
            line[lregion + '_3p_del'] += lefthand_portion

            line[rregion + '_gl_seq'] = line[rregion + '_gl_seq'][righthand_portion:]
            line[rregion + '_qr_seq'] = r_qr_seq[righthand_portion:]
            line[rregion + '_5p_del'] += righthand_portion
        elif debug:
            print(' no %s overlap, not doing anything' % (lregion + rregion))

    naive_seq = utils.get_full_naive_seq(germlines, line)
    muted_seq = line['seq']
    if len(naive_seq) != len(muted_seq):
        # this will happen if, for instance, there's a really short D match and there's no way to
        # figure out where it was really supposed to be
        if debug:
            print(' unequal lengths:')
            print('%s %s' % (' ', naive_seq))
            print('%s %s' % (' ', muted_seq))
        raise AssertionError('naive sequence length %d differs from sequence length %d' % (len(naive_seq), len(muted_seq)))