def get_pos_in_alignment(codon, aligned_seq, seq, pos):
    """
    Translate <pos>, an index into the ungapped <seq>, into the matching index in <aligned_seq>.

    The result is <pos> shifted right by the number of gap characters reported by
    get_n_gaps_up_to_pos(). Both the input and the translated position are
    checked with utils.codon_ok() for <codon>; a failed check raises AssertionError.

    NOTE relies on <debug> and get_n_gaps_up_to_pos() from the enclosing scope.
    """
    # this only gets called on the gene with the *known* position, so it shouldn't fail
    assert utils.codon_ok(codon, seq, pos, debug=debug)

    n_gaps_before = get_n_gaps_up_to_pos(aligned_seq, pos)
    aligned_pos = pos + n_gaps_before

    # the codon should still be intact once we've accounted for the gaps
    assert utils.codon_ok(codon, aligned_seq, aligned_pos, debug=debug)
    return aligned_pos
def generate_snpd_gene(gene, cpos, seq, positions):
    """
    Build a new allele of <gene> by introducing one point mutation per entry in <positions>.

    gene:      name of the template gene (must be a v gene, others not yet handled)
    cpos:      position passed to utils.codon_ok() as the "cyst" codon position in <seq>
    seq:       template sequence
    positions: iterable of positions to mutate; a None entry means "choose one at random"

    Prints one line per mutation, and returns a dict with keys "template-gene",
    "gene" (the name from get_new_allele_name_and_change_mutfo()) and "seq" (the mutated sequence).
    Raises AssertionError if <gene> isn't a v gene, or if the final sequence fails the cyst check.
    """
    assert utils.get_region(gene) == "v"  # others not yet handled

    # every position mutated so far; only consulted when we have to choose a position ourselves
    already_snpd = set()

    def choose_position():
        """ draw random positions until we get one that's unused and leaves the cyst codon ok """
        while True:
            candidate = random.randint(10, len(seq) - 15)  # note that randint() is inclusive
            if candidate in already_snpd:
                continue
            masked_seq = seq[:candidate] + "X" + seq[candidate + 1 :]  # for checking cyst position
            if utils.codon_ok("cyst", masked_seq, cpos, debug=True):
                return candidate

    mutfo = OrderedDict()
    for requested_pos in positions:
        snp_pos = choose_position() if requested_pos is None else requested_pos
        already_snpd.add(snp_pos)

        # keep drawing until the new base differs from the one that's already there
        while True:
            new_base = utils.nukes[random.randint(0, len(utils.nukes) - 1)]
            if new_base != seq[snp_pos]:
                break

        old_base = seq[snp_pos]
        print(" %3d %s --> %s" % (snp_pos, old_base, new_base))
        mutfo[snp_pos] = {"original": old_base, "new": new_base}
        seq = seq[:snp_pos] + new_base + seq[snp_pos + 1 :]

    assert utils.codon_ok("cyst", seq, cpos, debug=True)  # this is probably unnecessary

    # the (possibly modified) mutfo that comes back isn't needed here, only the name
    snpd_name, _ = get_new_allele_name_and_change_mutfo(gene, mutfo)
    return {"template-gene": gene, "gene": snpd_name, "seq": seq}
def revert_conserved_codons(self, seq):
    """
    Return <seq> with each conserved codon restored to its unmutated bases (eg if s.h.m. messed them up).

    For every (region, position) in self.final_codon_positions, the three bases at
    that position are replaced by self.unmutated_codons[region] if they differ from it.
    Raises AssertionError if an unmutated codon isn't three bases long, or if
    utils.codon_ok() rejects the codon at that position.
    """
    for region, pos in self.final_codon_positions.items():
        codon_end = pos + 3
        original_codon = self.unmutated_codons[region]
        if seq[pos:codon_end] != original_codon:
            assert len(original_codon) == 3
            seq = seq[:pos] + original_codon + seq[codon_end:]
        codon_name = utils.conserved_codons[self.glfo['chain']][region]
        assert utils.codon_ok(codon_name, seq, pos)
    return seq
def remove_v_genes_with_bad_cysteines(glfo, debug=False):
    """
    Remove from <glfo> every v gene whose sequence fails the "cyst" check at its recorded position.

    glfo:  germline info dict; reads glfo["seqs"]["v"] and glfo["cyst-positions"]
    debug: forwarded to remove_gene() (the summary line below is printed regardless)

    The actual removal is delegated to remove_gene(). Returns nothing.
    """
    prelength = len(glfo["seqs"]["v"])

    # Iterate over a snapshot of the gene names: remove_gene() is expected to delete entries from the
    # dict we'd otherwise be looping over, which raises RuntimeError on python 3 (on python 2, keys()
    # was already a list, so this is equivalent there).
    for gene in list(glfo["seqs"]["v"]):
        if not utils.codon_ok("cyst", glfo["seqs"]["v"][gene], glfo["cyst-positions"][gene]):
            remove_gene(glfo, gene, debug=debug)

    # Report the number removed out of the *original* total (the denominator used to be the
    # post-removal count, which made the fraction misleading).
    n_removed = prelength - len(glfo["seqs"]["v"])
    print(" removed %d / %d v genes with bad cysteines" % (n_removed, prelength))
def choose_position():
    """
    Draw random positions until one is found that hasn't been used yet and that leaves the cyst codon ok.

    A candidate is accepted if it isn't in <snpd_positions>, and if <seq> with an "X" substituted at the
    candidate position still passes utils.codon_ok() for "cyst" at <cpos>.

    NOTE relies on <seq>, <cpos>, and <snpd_positions> from the enclosing scope.
    """
    while True:
        candidate = random.randint(10, len(seq) - 15)  # note that randint() is inclusive
        if candidate in snpd_positions:
            continue
        masked_seq = seq[:candidate] + "X" + seq[candidate + 1 :]  # for checking cyst position
        if utils.codon_ok("cyst", masked_seq, cpos, debug=True):
            return candidate
def get_missing_codon_info(glfo, debug=False):
    """
    Fill in conserved codon positions for genes in <glfo> that have a sequence but no position.

    For each (region, codon) pair in utils.conserved_codons for this chain:
      - get alignments for the region from get_new_alignments()
      - take the first gene with a known, valid position, and translate that position into the alignment
      - for each gene that's missing a position, translate the aligned position back to an unaligned
        one, store it in glfo[<codon>-positions], and check the lot with utils.check_a_bunch_of_codons()

    Modifies <glfo> in place and returns nothing. Raises Exception if no usable known position is found.
    """

    # ----------------------------------------------------------------------------------------
    def get_n_gaps_up_to_pos(aligned_seq, pos):  # NOTE I think this duplicates the functionality of count_gaps()
        """ return number of gapped positions in <aligned_seq> before <pos> """
        n_bases_passed = 0  # position in unaligned sequence
        n_gaps_passed = 0  # number of gapped positions in the aligned sequence that we pass before getting to <pos>
        while n_bases_passed < pos:
            if aligned_seq[n_bases_passed + n_gaps_passed] in utils.gap_chars:
                n_gaps_passed += 1
            else:
                n_bases_passed += 1
        return n_gaps_passed

    # ----------------------------------------------------------------------------------------
    def get_pos_in_alignment(codon, aligned_seq, seq, pos):
        """ given <pos> in <seq>, find the codon's position in <aligned_seq> """
        # this only gets called on the gene with the *known* position, so it shouldn't fail
        assert utils.codon_ok(codon, seq, pos, debug=debug)
        aligned_pos = pos + get_n_gaps_up_to_pos(aligned_seq, pos)
        assert utils.codon_ok(codon, aligned_seq, aligned_pos, debug=debug)
        return aligned_pos

    for region, codon in utils.conserved_codons[glfo["chain"]].items():
        position_key = codon + "-positions"

        missing_genes = set(glfo["seqs"][region]) - set(glfo[position_key])
        if not missing_genes:
            if debug:
                print(" no missing %s info" % codon)
            continue

        if debug:
            print(" missing %d %s positions" % (len(missing_genes), codon))

        aligned_seqs = get_new_alignments(glfo, region, debug=debug)

        # Work out the codon position in the alignment from an existing codon position. This assumes that
        # once aligned, all genes have the same codon position -- which is only really true for the
        # imgt-gapped alignment (and tryp positions in particular may not reliably align to the same position).
        if len(glfo[position_key]) > 0:
            # take the first one for which we have the sequence (NOTE it would be safer to check that they're all the same)
            usable_known = (
                (gene, pos)
                for gene, pos in glfo[position_key].items()
                if gene in glfo["seqs"][region]
                and gene in aligned_seqs
                and utils.codon_ok(codon, glfo["seqs"][region][gene], pos)
            )
            known_gene, known_pos = next(usable_known, (None, None))
            if known_gene is None:
                raise Exception("couldn't find a known %s position" % codon)

            # NOTE for cyst, should be 309 if alignments are imgt [which they used to usually be, but now
            # probably aren't] (imgt says 104th codon --> subtract 1 to get zero-indexing, then multiply
            # by three: 3 * (104 - 1) = 309)
            known_pos_in_alignment = get_pos_in_alignment(
                codon, aligned_seqs[known_gene], glfo["seqs"][region][known_gene], known_pos
            )
            if debug:
                print(" using known position %d (aligned %d) in %s" % (known_pos, known_pos_in_alignment, known_gene))
        elif codon == "cyst":
            known_pos_in_alignment = 309
            print(
                " assuming aligned %s position is %d (this will %s work if you're using imgt alignments)"
                % (codon, known_pos_in_alignment, utils.color("red", "only"))
            )
            raise Exception("not really using imgt alignments much any more, so this isn't really going to work")
        else:
            raise Exception("no existing %s info, and couldn't guess it, either" % codon)

        # translate the aligned position back to an unaligned position for each gene that's missing one
        seqons = []  # (seq, pos) pairs
        for gene in missing_genes:
            n_gaps = utils.count_gaps(aligned_seqs[gene], istop=known_pos_in_alignment)
            unaligned_pos = known_pos_in_alignment - n_gaps
            seqons.append((glfo["seqs"][region][gene], unaligned_pos))
            glfo[position_key][gene] = unaligned_pos
        n_added = len(seqons)  # one pair is appended for every position that gets added

        utils.check_a_bunch_of_codons(codon, seqons, extra_str=" ", debug=debug)
        if debug:
            print(" added %d %s positions" % (n_added, codon))