def summarize_query(self, query_name, query_seq, all_match_names, all_query_bounds, all_germline_bounds, warnings, first_match_query_bounds):
    """Pick the best match per region for one query and record its k_v/k_d search ranges.

    Parameters
    ----------
    query_name : name of the query (formatted with %s throughout, so treated as a string)
    query_seq : the query sequence
    all_match_names : dict region -> list of (score, gene); each list is re-sorted
        here in descending order and written back into the dict
    all_query_bounds, all_germline_bounds : dict gene -> (start, end) of the match
        on the query / on the germline sequence
    warnings : passed through untouched to self.print_match
    first_match_query_bounds : query bounds of the first v match; its end is the
        reference point from which every d match's k_d is measured

    Returns None in every case. On success the result is handed to
    self.add_to_info; the query is dropped (with a printed message where the
    code has one) if a region has no usable match, if boundary apportionment
    fails, or if it is unproductive and self.args.skip_unproductive is set.

    Raises AssertionError if any bounds are inconsistent with the sequence
    lengths, or if a match covers different lengths on query and germline.
    """
    if self.debug:
        print('%s' % query_name)

    best, match_names, n_matches = {}, {}, {}
    n_used = {'v': 0, 'd': 0, 'j': 0}
    k_v_min, k_d_min = 999, 999
    k_v_max, k_d_max = 0, 0
    for region in utils.regions:
        # highest score first: the first gene that survives the filters below becomes <best>
        all_match_names[region] = sorted(all_match_names[region], reverse=True)
        match_names[region] = []
    codon_positions = {'v': -1, 'd': -1, 'j': -1}  # conserved codon positions (v:cysteine, d:dummy, j:tryptophan)

    for region in utils.regions:
        n_matches[region] = len(all_match_names[region])
        n_skipped = 0
        for score, gene in all_match_names[region]:
            glbounds = all_germline_bounds[gene]
            qrbounds = all_query_bounds[gene]
            assert qrbounds[1] <= len(query_seq)  # NOTE these are checked in process_query as well, so in time they should be removed from here
            assert glbounds[1] <= len(self.germline_seqs[region][gene])
            assert qrbounds[0] >= 0
            assert glbounds[0] >= 0
            glmatchseq = self.germline_seqs[region][gene][glbounds[0]:glbounds[1]]

            # TODO since genes after the first <args.n_max_per_region> are no longer skipped, the OR of k-space below is overly conservative

            # only use a specified set of genes
            if self.args.only_genes is not None and gene not in self.args.only_genes:
                n_skipped += 1
                continue

            # add match to the list
            n_used[region] += 1
            match_names[region].append(gene)

            self.print_match(region, gene, query_seq, score, glbounds, qrbounds, -1, warnings, skipping=False)

            # if the germline match and the query match aren't the same length, s-w likely added an insert,
            # which we shouldn't get since the gap-open penalty is set so high
            if len(glmatchseq) != len(query_seq[qrbounds[0]:qrbounds[1]]):
                # query_name is a string, so it has to go through %s (%d would raise TypeError and hide this diagnostic)
                print('ERROR %s not same length' % query_name)
                print('%s %s %s' % (glmatchseq, glbounds[0], glbounds[1]))
                print(query_seq[qrbounds[0]:qrbounds[1]])
                assert False

            if region == 'v':
                # NOTE even if the v match doesn't start at the left hand edge of the query sequence, we still
                # measure k_v from there. In other words, sw doesn't tell the hmm about it
                this_k_v = all_query_bounds[gene][1]
                k_v_min = min(this_k_v, k_v_min)
                k_v_max = max(this_k_v, k_v_max)
            if region == 'd':
                this_k_d = all_query_bounds[gene][1] - first_match_query_bounds[1]  # end of d minus end of v
                k_d_min = min(this_k_d, k_d_min)
                k_d_max = max(this_k_d, k_d_max)

            # the first gene to get this far is the best match for its region
            # (the best match is excised in the s-w code, and ham is run with *one* k_v k_d set)
            if region not in best:
                best[region] = gene
                best[region + '_gl_seq'] = self.germline_seqs[region][gene][glbounds[0]:glbounds[1]]
                best[region + '_qr_seq'] = query_seq[qrbounds[0]:qrbounds[1]]
                best[region + '_score'] = score

        if self.debug and n_skipped > 0:
            print('%8s skipped %d %s genes' % ('', n_skipped, region))

    for region in utils.regions:
        if region not in best:
            print(' no %s match found for %s' % (region, query_name))  # NOTE if no d match found, we should really just assume entire d was eroded
            return

    # s-w allows d and j matches to overlap, so we need to apportion the disputed bases
    try:
        self.shift_overlapping_boundaries(all_query_bounds, all_germline_bounds, query_name, query_seq, best)
    except AssertionError:
        print('%s: apportionment failed' % query_name)
        return

    # check for unproductive rearrangements
    for region in utils.regions:
        # position in the query sequence, that is
        codon_positions[region] = utils.get_conserved_codon_position(self.cyst_positions, self.tryp_positions, region, best[region], all_germline_bounds[best[region]], all_query_bounds[best[region]], assert_on_fail=False)
    codons_ok = utils.check_both_conserved_codons(query_seq, codon_positions['v'], codon_positions['j'], debug=self.debug, extra_str=' ', assert_on_fail=False)
    cdr3_length = codon_positions['j'] - codon_positions['v'] + 3
    in_frame_cdr3 = (cdr3_length % 3 == 0)
    if self.debug and not in_frame_cdr3:
        print(' out of frame cdr3: %d %% 3 = %d' % (cdr3_length, cdr3_length % 3))
    no_stop_codon = utils.stop_codon_check(query_seq, codon_positions['v'], debug=self.debug)
    if not codons_ok or not in_frame_cdr3 or not no_stop_codon:
        if self.debug:
            print(' unproductive rearrangement in waterer codons_ok: %s in_frame_cdr3: %s no_stop_codon: %s' % (codons_ok, in_frame_cdr3, no_stop_codon))
        if self.args.skip_unproductive:
            if self.debug:
                print(' ...skipping')
            self.n_unproductive += 1
            self.info['skipped_unproductive_queries'].append(query_name)
            return

    # best k_v, k_d:
    k_v = all_query_bounds[best['v']][1]  # end of v match
    k_d = all_query_bounds[best['d']][1] - all_query_bounds[best['v']][1]  # end of d minus end of v

    if k_d_max < 5:  # since the s-w step matches to the longest possible j and then excises it, this sometimes gobbles up the d, resulting in a very short d alignment.
        if self.debug:
            print(' expanding k_d')
        k_d_max = max(8, k_d_max)

    if 'IGHJ4*' in best['j'] and self.germline_seqs['d'][best['d']][-5:] == 'ACTAC':  # the end of some d versions is the same as the start of some j versions, so the s-w frequently kicks out the 'wrong' alignment
        if self.debug:
            print(' doubly expanding k_d')
        if k_d_max - k_d_min < 8:
            k_d_min -= 5
            k_d_max += 2

    # Both lower bounds are clamped to 1, not 0: the assertion just below requires
    # strictly positive values, so a clamp to 0 would turn a short v match into a crash.
    k_v_min = max(1, k_v_min - self.args.default_v_fuzz)
    k_v_max += self.args.default_v_fuzz
    k_d_min = max(1, k_d_min - self.args.default_d_fuzz)
    k_d_max += self.args.default_d_fuzz
    assert k_v_min > 0 and k_d_min > 0 and k_v_max > 0 and k_d_max > 0

    if self.debug:
        print(' k_v: %d [%d-%d)' % (k_v, k_v_min, k_v_max))
        print(' k_d: %d [%d-%d)' % (k_d, k_d_min, k_d_max))
        # one line, space-joined, laid out exactly as a chain of trailing-comma print statements would emit it
        used_fields = [' used'] + [' %s: %d/%d' % (region, n_used[region], n_matches[region]) for region in utils.regions] + ['']
        print(' '.join(used_fields))

    kvals = {}
    kvals['v'] = {'best': k_v, 'min': k_v_min, 'max': k_v_max}
    kvals['d'] = {'best': k_d, 'min': k_d_min, 'max': k_d_max}
    self.add_to_info(query_name, query_seq, kvals, match_names, best, all_germline_bounds, all_query_bounds, codon_positions=codon_positions)
def summarize_query(self, query_name, query_seq, all_match_names, all_query_bounds, all_germline_bounds, warnings, first_match_query_bounds):
    """Pick the best match per region for one query and record its k_v/k_d search ranges.

    Parameters
    ----------
    query_name : name of the query (formatted with %s throughout, so treated as a string)
    query_seq : the query sequence
    all_match_names : dict region -> list of (score, gene); each list is re-sorted
        here in descending order and written back into the dict
    all_query_bounds, all_germline_bounds : dict gene -> (start, end) of the match
        on the query / on the germline sequence
    warnings : passed through untouched to self.print_match
    first_match_query_bounds : query bounds of the first v match; its end is the
        reference point from which every d match's k_d is measured

    Returns None in every case. On success the result is handed to
    self.add_to_info; the query is dropped (with a printed message where the
    code has one) if a region has no usable match, if boundary apportionment
    fails, or if it is unproductive and self.args.skip_unproductive is set.

    Raises AssertionError if any bounds are inconsistent with the sequence
    lengths, or if a match covers different lengths on query and germline.
    """
    if self.debug:
        print('%s' % query_name)

    best, match_names, n_matches = {}, {}, {}
    n_used = {'v': 0, 'd': 0, 'j': 0}
    k_v_min, k_d_min = 999, 999
    k_v_max, k_d_max = 0, 0
    for region in utils.regions:
        # highest score first: the first gene that survives the filters below becomes <best>
        all_match_names[region] = sorted(all_match_names[region], reverse=True)
        match_names[region] = []
    codon_positions = {'v': -1, 'd': -1, 'j': -1}  # conserved codon positions (v:cysteine, d:dummy, j:tryptophan)

    for region in utils.regions:
        n_matches[region] = len(all_match_names[region])
        n_skipped = 0
        for score, gene in all_match_names[region]:
            glbounds = all_germline_bounds[gene]
            qrbounds = all_query_bounds[gene]
            assert qrbounds[1] <= len(query_seq)  # NOTE these are checked in process_query as well, so in time they should be removed from here
            assert glbounds[1] <= len(self.germline_seqs[region][gene])
            assert qrbounds[0] >= 0
            assert glbounds[0] >= 0
            glmatchseq = self.germline_seqs[region][gene][glbounds[0]:glbounds[1]]

            # TODO since genes after the first <args.n_max_per_region> are no longer skipped, the OR of k-space below is overly conservative

            # only use a specified set of genes
            if self.args.only_genes is not None and gene not in self.args.only_genes:
                n_skipped += 1
                continue

            # add match to the list
            n_used[region] += 1
            match_names[region].append(gene)

            self.print_match(region, gene, query_seq, score, glbounds, qrbounds, -1, warnings, skipping=False)

            # if the germline match and the query match aren't the same length, s-w likely added an insert,
            # which we shouldn't get since the gap-open penalty is set so high
            if len(glmatchseq) != len(query_seq[qrbounds[0]:qrbounds[1]]):
                # query_name is a string, so it has to go through %s (%d would raise TypeError and hide this diagnostic)
                print('ERROR %s not same length' % query_name)
                print('%s %s %s' % (glmatchseq, glbounds[0], glbounds[1]))
                print(query_seq[qrbounds[0]:qrbounds[1]])
                assert False

            if region == 'v':
                # NOTE even if the v match doesn't start at the left hand edge of the query sequence, we still
                # measure k_v from there. In other words, sw doesn't tell the hmm about it
                this_k_v = all_query_bounds[gene][1]
                k_v_min = min(this_k_v, k_v_min)
                k_v_max = max(this_k_v, k_v_max)
            if region == 'd':
                this_k_d = all_query_bounds[gene][1] - first_match_query_bounds[1]  # end of d minus end of v
                k_d_min = min(this_k_d, k_d_min)
                k_d_max = max(this_k_d, k_d_max)

            # the first gene to get this far is the best match for its region
            # (the best match is excised in the s-w code, and ham is run with *one* k_v k_d set)
            if region not in best:
                best[region] = gene
                best[region + '_gl_seq'] = self.germline_seqs[region][gene][glbounds[0]:glbounds[1]]
                best[region + '_qr_seq'] = query_seq[qrbounds[0]:qrbounds[1]]
                best[region + '_score'] = score

        if self.debug and n_skipped > 0:
            print('%8s skipped %d %s genes' % ('', n_skipped, region))

    for region in utils.regions:
        if region not in best:
            print(' no %s match found for %s' % (region, query_name))  # NOTE if no d match found, we should really just assume entire d was eroded
            return

    # s-w allows d and j matches to overlap, so we need to apportion the disputed bases
    try:
        self.shift_overlapping_boundaries(all_query_bounds, all_germline_bounds, query_name, query_seq, best)
    except AssertionError:
        print('%s: apportionment failed' % query_name)
        return

    # check for unproductive rearrangements
    for region in utils.regions:
        # position in the query sequence, that is
        codon_positions[region] = utils.get_conserved_codon_position(self.cyst_positions, self.tryp_positions, region, best[region], all_germline_bounds[best[region]], all_query_bounds[best[region]], assert_on_fail=False)
    codons_ok = utils.check_both_conserved_codons(query_seq, codon_positions['v'], codon_positions['j'], debug=self.debug, extra_str=' ', assert_on_fail=False)
    cdr3_length = codon_positions['j'] - codon_positions['v'] + 3
    in_frame_cdr3 = (cdr3_length % 3 == 0)
    if self.debug and not in_frame_cdr3:
        print(' out of frame cdr3: %d %% 3 = %d' % (cdr3_length, cdr3_length % 3))
    no_stop_codon = utils.stop_codon_check(query_seq, codon_positions['v'], debug=self.debug)
    if not codons_ok or not in_frame_cdr3 or not no_stop_codon:
        if self.debug:
            print(' unproductive rearrangement in waterer codons_ok: %s in_frame_cdr3: %s no_stop_codon: %s' % (codons_ok, in_frame_cdr3, no_stop_codon))
        if self.args.skip_unproductive:
            if self.debug:
                print(' ...skipping')
            self.n_unproductive += 1
            self.info['skipped_unproductive_queries'].append(query_name)
            return

    # best k_v, k_d:
    k_v = all_query_bounds[best['v']][1]  # end of v match
    k_d = all_query_bounds[best['d']][1] - all_query_bounds[best['v']][1]  # end of d minus end of v

    if k_d_max < 5:  # since the s-w step matches to the longest possible j and then excises it, this sometimes gobbles up the d, resulting in a very short d alignment.
        if self.debug:
            print(' expanding k_d')
        k_d_max = max(8, k_d_max)

    if 'IGHJ4*' in best['j'] and self.germline_seqs['d'][best['d']][-5:] == 'ACTAC':  # the end of some d versions is the same as the start of some j versions, so the s-w frequently kicks out the 'wrong' alignment
        if self.debug:
            print(' doubly expanding k_d')
        if k_d_max - k_d_min < 8:
            k_d_min -= 5
            k_d_max += 2

    # Both lower bounds are clamped to 1, not 0: the assertion just below requires
    # strictly positive values, so a clamp to 0 would turn a short v match into a crash.
    k_v_min = max(1, k_v_min - self.args.default_v_fuzz)
    k_v_max += self.args.default_v_fuzz
    k_d_min = max(1, k_d_min - self.args.default_d_fuzz)
    k_d_max += self.args.default_d_fuzz
    assert k_v_min > 0 and k_d_min > 0 and k_v_max > 0 and k_d_max > 0

    if self.debug:
        print(' k_v: %d [%d-%d)' % (k_v, k_v_min, k_v_max))
        print(' k_d: %d [%d-%d)' % (k_d, k_d_min, k_d_max))
        # one line, space-joined, laid out exactly as a chain of trailing-comma print statements would emit it
        used_fields = [' used'] + [' %s: %d/%d' % (region, n_used[region], n_matches[region]) for region in utils.regions] + ['']
        print(' '.join(used_fields))

    kvals = {}
    kvals['v'] = {'best': k_v, 'min': k_v_min, 'max': k_v_max}
    kvals['d'] = {'best': k_d, 'min': k_d_min, 'max': k_d_max}
    self.add_to_info(query_name, query_seq, kvals, match_names, best, all_germline_bounds, all_query_bounds, codon_positions=codon_positions)
def process_query(self, bam, reads):
    """Collect every usable match reported for one query and pass them to summarize_query.

    Parameters
    ----------
    bam : object whose `references` sequence maps a read's `tid` to a gene name
    reads : the alignment records for a single query. It is scanned twice (once
        for the primary record, once in the main loop), so it must be a
        re-iterable sequence such as a list, not a one-shot iterator.

    Returns None in every case. Side effects: `seq` is set on every read; the
    first match of a region that contains an indel is recorded in
    self.info['indels'] (and self.new_indels is incremented), after which the
    query is abandoned for this pass without calling summarize_query.

    Raises Exception if a v gene's stored cysteine position fails its check or
    if a match covers different lengths on query and germline, and
    AssertionError if match bounds run past the end of either sequence.
    """
    primary = next((r for r in reads if not r.is_secondary), None)
    if primary is None:
        # empty <reads>, or nothing but secondary records: there is no sequence or name to work with
        print('  no primary (non-secondary) read found, skipping query')
        return
    query_seq = primary.seq
    query_name = primary.qname
    first_match_query_bounds = None  # since sw excises its favorite v match, we have to know this match's boundaries in order to calculate k_d for all the other matches
    all_match_names = {}
    warnings = {}  # ick, this is a messy way to pass stuff around
    for region in utils.regions:
        all_match_names[region] = []
    all_query_bounds, all_germline_bounds = {}, {}
    n_skipped_invalid_cpos = 0  # only reported by the commented-out debug print at the bottom
    for read in reads:  # loop over the matches found for each query sequence
        # set this match's values
        read.seq = query_seq  # only the first one has read.seq set by default, so we need to set the rest by hand
        gene = bam.references[read.tid]
        region = utils.get_region(gene)
        raw_score = read.tags[0][1]  # raw because it doesn't include the gene choice probs (assumes the first tag holds the alignment score -- TODO confirm)
        score = raw_score
        if self.args.apply_choice_probs_in_sw:  # NOTE no longer applied by default: smith-waterman scores don't correspond to log-probs, so throwing on the gene choice probs was dubious (and didn't seem to work that well)
            score = self.get_choice_prob(region, gene) * raw_score  # multiply by the probability to choose this gene
        qrbounds = (read.qstart, read.qend)
        glbounds = (read.pos, read.aend)
        if region == 'v' and first_match_query_bounds is None:
            # recorded before any of the skip checks below, i.e. even if this v match ends up discarded
            first_match_query_bounds = qrbounds

        # perform a few checks and see if we want to skip this match
        if region == 'v':  # skip matches with cpos past the end of the query seq (i.e. eroded a ton on the right side of the v)
            cpos = utils.get_conserved_codon_position(self.cyst_positions, self.tryp_positions, 'v', gene, glbounds, qrbounds, assert_on_fail=False)
            if not utils.check_conserved_cysteine(self.germline_seqs['v'][gene], self.cyst_positions[gene]['cysteine-position'], assert_on_fail=False):  # some of the cysteine positions in the json file were wrong, so now we check
                raise Exception('bad cysteine in %s: %d %s' % (gene, self.cyst_positions[gene]['cysteine-position'], self.germline_seqs['v'][gene]))
            if cpos < 0 or cpos >= len(query_seq):
                n_skipped_invalid_cpos += 1
                continue

        if 'I' in read.cigarstring or 'D' in read.cigarstring:  # skip indels, and tell the HMM to skip indels (you won't see any unless you decrease the <self.args.gap_open_penalty>)
            if not all_match_names[region]:  # if this is the first (best) match for this region, allow indels (otherwise skip the match)
                if query_name not in self.info['indels']:
                    self.info['indels'][query_name] = self.get_indel_info(query_name, read.cigarstring, query_seq[qrbounds[0]:qrbounds[1]], self.germline_seqs[region][gene][glbounds[0]:glbounds[1]], gene)
                    # pad the stored sequence back out with the parts of the query on either side of the match
                    self.info['indels'][query_name]['reversed_seq'] = query_seq[:qrbounds[0]] + self.info['indels'][query_name]['reversed_seq'] + query_seq[qrbounds[1]:]
                    self.new_indels += 1
                    # print ' query seq %s' % query_seq
                    # print 'indelfo seq %s' % self.info['indels'][query_name]['reversed_seq']
                    # self.info['skipped_indel_queries'].append(query_name)
                    # self.info[query_name] = {'indels'}
                else:
                    print(' multiple indels for %s' % query_name)
                # either way we stop here: a match containing an indel generally spans different
                # lengths on query and germline, so it could not pass the equal-length check below
                return
            else:
                continue

        if qrbounds[1] - qrbounds[0] != glbounds[1] - glbounds[0]:
            # bounds are passed in the order the message names them: germline first, then query
            raise Exception('germline match (%d %d) not same length as query match (%d %d)' % (glbounds[0], glbounds[1], qrbounds[0], qrbounds[1]))

        assert qrbounds[1] <= len(query_seq)
        if glbounds[1] > len(self.germline_seqs[region][gene]):
            # dump the offending values before the assertion below fires
            print('  %s' % gene)
            print('  %s %s' % (glbounds[1], len(self.germline_seqs[region][gene])))
            print('  %s' % self.germline_seqs[region][gene])
        assert glbounds[1] <= len(self.germline_seqs[region][gene])
        assert qrbounds[1] - qrbounds[0] == glbounds[1] - glbounds[0]

        # and finally add this match's information
        warnings[gene] = ''
        all_match_names[region].append((score, gene))  # NOTE it is important that this is ordered such that the best match is first
        all_query_bounds[gene] = qrbounds
        all_germline_bounds[gene] = glbounds

    # if n_skipped_invalid_cpos > 0:
    #     print ' skipped %d invalid cpos values for %s' % (n_skipped_invalid_cpos, query_name)
    self.summarize_query(query_name, query_seq, all_match_names, all_query_bounds, all_germline_bounds, warnings, first_match_query_bounds)
def process_query(self, bam, reads):
    """Collect every usable match reported for one query and pass them to summarize_query.

    Parameters
    ----------
    bam : object whose `references` sequence maps a read's `tid` to a gene name
    reads : the alignment records for a single query. It is scanned twice (once
        for the primary record, once in the main loop), so it must be a
        re-iterable sequence such as a list, not a one-shot iterator.

    Returns None in every case. Side effects: `seq` is set on every read; the
    first match of a region that contains an indel is recorded in
    self.info['indels'] (and self.new_indels is incremented), after which the
    query is abandoned for this pass without calling summarize_query.

    Raises Exception if a v gene's stored cysteine position fails its check or
    if a match covers different lengths on query and germline, and
    AssertionError if match bounds run past the end of either sequence.
    """
    primary = next((r for r in reads if not r.is_secondary), None)
    if primary is None:
        # empty <reads>, or nothing but secondary records: there is no sequence or name to work with
        print('  no primary (non-secondary) read found, skipping query')
        return
    query_seq = primary.seq
    query_name = primary.qname
    first_match_query_bounds = None  # since sw excises its favorite v match, we have to know this match's boundaries in order to calculate k_d for all the other matches
    all_match_names = {}
    warnings = {}  # ick, this is a messy way to pass stuff around
    for region in utils.regions:
        all_match_names[region] = []
    all_query_bounds, all_germline_bounds = {}, {}
    n_skipped_invalid_cpos = 0  # only reported by the commented-out debug print at the bottom
    for read in reads:  # loop over the matches found for each query sequence
        # set this match's values
        read.seq = query_seq  # only the first one has read.seq set by default, so we need to set the rest by hand
        gene = bam.references[read.tid]
        region = utils.get_region(gene)
        raw_score = read.tags[0][1]  # raw because it doesn't include the gene choice probs (assumes the first tag holds the alignment score -- TODO confirm)
        score = raw_score
        if self.args.apply_choice_probs_in_sw:  # NOTE no longer applied by default: smith-waterman scores don't correspond to log-probs, so throwing on the gene choice probs was dubious (and didn't seem to work that well)
            score = self.get_choice_prob(region, gene) * raw_score  # multiply by the probability to choose this gene
        qrbounds = (read.qstart, read.qend)
        glbounds = (read.pos, read.aend)
        if region == 'v' and first_match_query_bounds is None:
            # recorded before any of the skip checks below, i.e. even if this v match ends up discarded
            first_match_query_bounds = qrbounds

        # perform a few checks and see if we want to skip this match
        if region == 'v':  # skip matches with cpos past the end of the query seq (i.e. eroded a ton on the right side of the v)
            cpos = utils.get_conserved_codon_position(self.cyst_positions, self.tryp_positions, 'v', gene, glbounds, qrbounds, assert_on_fail=False)
            if not utils.check_conserved_cysteine(self.germline_seqs['v'][gene], self.cyst_positions[gene]['cysteine-position'], assert_on_fail=False):  # some of the cysteine positions in the json file were wrong, so now we check
                raise Exception('bad cysteine in %s: %d %s' % (gene, self.cyst_positions[gene]['cysteine-position'], self.germline_seqs['v'][gene]))
            if cpos < 0 or cpos >= len(query_seq):
                n_skipped_invalid_cpos += 1
                continue

        if 'I' in read.cigarstring or 'D' in read.cigarstring:  # skip indels, and tell the HMM to skip indels (you won't see any unless you decrease the <self.args.gap_open_penalty>)
            if not all_match_names[region]:  # if this is the first (best) match for this region, allow indels (otherwise skip the match)
                if query_name not in self.info['indels']:
                    self.info['indels'][query_name] = self.get_indel_info(query_name, read.cigarstring, query_seq[qrbounds[0]:qrbounds[1]], self.germline_seqs[region][gene][glbounds[0]:glbounds[1]], gene)
                    # pad the stored sequence back out with the parts of the query on either side of the match
                    self.info['indels'][query_name]['reversed_seq'] = query_seq[:qrbounds[0]] + self.info['indels'][query_name]['reversed_seq'] + query_seq[qrbounds[1]:]
                    self.new_indels += 1
                    # print ' query seq %s' % query_seq
                    # print 'indelfo seq %s' % self.info['indels'][query_name]['reversed_seq']
                    # self.info['skipped_indel_queries'].append(query_name)
                    # self.info[query_name] = {'indels'}
                else:
                    print(' multiple indels for %s' % query_name)
                # either way we stop here: a match containing an indel generally spans different
                # lengths on query and germline, so it could not pass the equal-length check below
                return
            else:
                continue

        if qrbounds[1] - qrbounds[0] != glbounds[1] - glbounds[0]:
            # bounds are passed in the order the message names them: germline first, then query
            raise Exception('germline match (%d %d) not same length as query match (%d %d)' % (glbounds[0], glbounds[1], qrbounds[0], qrbounds[1]))

        assert qrbounds[1] <= len(query_seq)
        if glbounds[1] > len(self.germline_seqs[region][gene]):
            # dump the offending values before the assertion below fires
            print('  %s' % gene)
            print('  %s %s' % (glbounds[1], len(self.germline_seqs[region][gene])))
            print('  %s' % self.germline_seqs[region][gene])
        assert glbounds[1] <= len(self.germline_seqs[region][gene])
        assert qrbounds[1] - qrbounds[0] == glbounds[1] - glbounds[0]

        # and finally add this match's information
        warnings[gene] = ''
        all_match_names[region].append((score, gene))  # NOTE it is important that this is ordered such that the best match is first
        all_query_bounds[gene] = qrbounds
        all_germline_bounds[gene] = glbounds

    # if n_skipped_invalid_cpos > 0:
    #     print ' skipped %d invalid cpos values for %s' % (n_skipped_invalid_cpos, query_name)
    self.summarize_query(query_name, query_seq, all_match_names, all_query_bounds, all_germline_bounds, warnings, first_match_query_bounds)