def desc_variants(coding_seq1, coding_seq2):
    """
    Describe the amino acid change between a reference coding sequence
    (coding_seq1) and a variant coding sequence (coding_seq2), both given
    as DNA strings.

    The reference must have a length that is a multiple of 3 and must
    translate to a protein whose only stop codon ("*") is its last codon.

    Returns a string (positions are 1-based):
      - "" if the translations are identical, if a frameshift does not
        alter any reference residue, or if a pure insertion falls before
        the first amino acid;
      - "<ref aa><pos>Shift" for a frameshift (variant length not a
        multiple of 3), naming the first reference residue that differs;
      - "<aa before insert><its pos><aa before insert><inserted aas>" for
        a pure insertion;
      - "<deleted aas><pos>Del" for a pure deletion;
      - "<ref aas><pos><variant aas>" for anything else.
    Variant residues are never reported beyond the first stop codon.

    Raises AssertionError if the reference coding sequence is malformed.
    """
    # NOTE(review): this file appears to define desc_variants a second time
    # further down; if so, that later definition is the one in effect.
    var_description = ""
    assert len(coding_seq1) % 3 == 0, (
        "Reference coding sequence malformed: "
        + "should have a length that is a multiple of 3! "
        + "DNA sequence is: " + coding_seq1
    )
    aa1 = list(translate(coding_seq1))
    assert "*" not in aa1[:-1], (
        "Reference coding sequence malformed: only "
        + "last codon should be stop codon! AA sequence is: "
        + "".join(aa1) + " DNA sequence is: " + coding_seq1
    )
    # "aa1 and ..." makes an empty reference fail this check instead of
    # raising a bare IndexError on aa1[-1].
    assert aa1 and aa1[-1] == "*", (
        "Reference coding sequence malformed: last "
        + "codon should be a stop codon! AA sequence is: "
        + "".join(aa1) + " DNA sequence is: " + coding_seq1
    )

    if len(coding_seq2) % 3 != 0:
        # Frameshift. Only complete codons can be translated, so drop the
        # trailing partial codon (floor division keeps the slice bound an
        # integer), then find the first amino acid that is changed.
        coding_seq2_trimmed = coding_seq2[:3 * (len(coding_seq2) // 3)]
        aa2 = list(translate(coding_seq2_trimmed))
        pos = 1
        while (pos <= len(aa1) and pos <= len(aa2)
               and aa1[pos - 1] == aa2[pos - 1]):
            pos += 1
        if pos <= len(aa1):
            var_description = aa1[pos - 1] + str(pos) + "Shift"
        return var_description

    aa2 = list(translate(coding_seq2))

    # Skip the residues shared at the start; remember the last shared one,
    # since insertions are reported relative to it.
    shared = 0
    limit = min(len(aa1), len(aa2))
    while shared < limit and aa1[shared] == aa2[shared]:
        shared += 1
    last_ref_aa = aa1[shared - 1] if shared else ""
    position = shared + 1
    aa1 = aa1[shared:]
    aa2 = aa2[shared:]

    # Drop the residues shared at the end.
    while aa1 and aa2 and aa1[-1] == aa2[-1]:
        aa1.pop()
        aa2.pop()

    # Don't report beyond the first stop codon of the variant; the
    # reference side is cut at the same offset.
    if "*" in aa2:
        cut = aa2.index("*") + 1
        report_aa1 = aa1[:cut]
        report_aa2 = aa2[:cut]
    else:
        report_aa1 = aa1
        report_aa2 = aa2

    if not aa1 and not aa2:
        # no change
        pass
    elif not aa1:
        # Pure insertion: report aa just before insert, pos of that aa,
        # repeat that aa and add the insert. Ignore if before 1st AA,
        # shouldn't get translated.
        if position > 1:
            var_description = (last_ref_aa + str(position - 1)
                               + last_ref_aa + "".join(report_aa2))
    elif not aa2:
        # Pure deletion: report all of aa1, first pos of aa1, "Del".
        var_description = "".join(aa1) + str(position) + "Del"
    else:
        # Substitution / indel: reference residues, first pos, variant
        # residues.
        var_description = ("".join(report_aa1) + str(position)
                           + "".join(report_aa2))
    return var_description
def _parse_exon_coords(field):
    """Turn a comma-separated coordinate field into a list of integers."""
    # Array-like values expose tostring(); plain strings do not.
    try:
        field = field.tostring()
    except AttributeError:
        pass
    # tostring() yields bytes on Python 3; decode so the str methods below
    # accept a "," separator.
    if not isinstance(field, str):
        field = field.decode("ascii")
    return [int(e) for e in field.strip(",").split(",")]


def infer_function(twobit_file, record, geneName, strand, cdsStart, cdsEnd,
                   exonStarts, exonEnds):
    """
    Infer "function" (as dbSNP calls it) given a reference TwoBitFile object,
    a GFFRecord object, and info about the gene: name, strand, coding
    sequence start, coding sequence end (both 0-based, half-open), exon
    starts (comma-separated string), and exon ends (comma-separated string).

    Returns one of:
      ("5'-UTR",) or ("3'-UTR",)
      ("intron", intron_number)
      ("synonymous coding", exon_number, amino_acid_residue)
      ("nonsynonymous coding", exon_number, "<ref><residue><mut> ...")
    where intron and exon numbers are 1-based, count only exons overlapping
    the coding region, and run in the direction of `strand`. The residue
    number is 1-based. Returns None if the record falls in none of these.

    The intron and exon tests assume record.start == record.end (i.e. SNPs)
    and that both are 1-based.
    """
    # we're done if it's not intronic or exonic
    # NOTE(review): these tests read record.strand, while everything below
    # uses the `strand` argument -- confirm the two always agree.
    if (record.strand == "+" and record.start <= cdsStart) or \
       (record.strand == "-" and record.end > cdsEnd):
        return ("5'-UTR",)
    if (record.strand == "+" and record.end > cdsEnd) or \
       (record.strand == "-" and record.start <= cdsStart):
        return ("3'-UTR",)

    # make exonStarts and exonEnds into lists of integers
    exonStarts = _parse_exon_coords(exonStarts)
    exonEnds = _parse_exon_coords(exonEnds)

    # make a list of all exons (chromosome order), in case we need it;
    # materialised so it can be indexed and reused
    all_exons = list(zip(exonStarts, exonEnds))

    # reverse for strand; note how we set aside all_exons first
    if strand == "-":
        exonStarts.reverse()
        exonEnds.reverse()

    # coding exons seen so far, as [start, end] trimmed to the coding region
    exons = []
    running_intron_count = running_exon_count = running_cds_bases_count = 0

    for raw_start, raw_end in zip(exonStarts, exonEnds):
        # discard any exon that lies wholly outside the coding region
        if not (raw_end > cdsStart and raw_start <= cdsEnd):
            continue

        # trim the start and end to the coding region
        exon_start = max(raw_start, cdsStart)
        exon_end = min(raw_end, cdsEnd)
        running_exon_count += 1

        # look at the intron between the previous coding exon and this one
        if exons:
            if strand == "+":
                intron_start = exons[-1][1]
                intron_end = exon_start
            else:
                intron_start = exon_end
                intron_end = exons[-1][0]
            running_intron_count += 1
            if intron_start < record.start and record.end <= intron_end:
                return ("intron", running_intron_count)

        if exon_start < record.start and record.end <= exon_end:
            # Count coding bases up to and including the record, measured
            # along the direction of translation, and derive the position
            # within the codon. The sign of frame_offset encodes strand.
            if strand == "+":
                # +1, +2, +3 in chromosome order; +1 is the first base of
                # the codon (frames 0, 2, 1 respectively in GTF notation)
                running_cds_bases_count += record.start - exon_start
                frame_offset = running_cds_bases_count % 3
                if frame_offset == 0:
                    frame_offset = 3
            else:
                # -3, -2, -1 in chromosome order; -1 is the first base of
                # the codon (frames 1, 2, 0 respectively in GTF notation)
                running_cds_bases_count += exon_end + 1 - record.end
                frame_offset = -1 * (running_cds_bases_count % 3)
                if frame_offset == 0:
                    frame_offset = -3

            # ceiling of count / 3, done in integer arithmetic
            amino_acid_residue = (running_cds_bases_count + 2) // 3

            # figure out which intervals hold the codon; only the
            # intervals are needed here
            _start_exon, _end_exon, intervals = codon_intersect(
                record.start - 1, record.end, all_exons, frame_offset)

            # calculate the chromosome name we want to use
            if record.seqname.startswith("chr"):
                chrom = record.seqname
            else:
                chrom = "chr" + record.seqname

            # look it up
            ref_seq = "".join([twobit_file[chrom][k[0]:k[1]]
                               for k in intervals])

            # index of the record's base within ref_seq (chromosome order):
            # 1, 2, 3 -> 0, 1, 2 and -3, -2, -1 -> 0, 1, 2
            replacement_coord = (frame_offset + 3) % 4

            # figure out which allele is not the mutant
            alleles = record.attributes["alleles"].strip("\"").split("/")
            try:
                alleles.remove(record.attributes["ref_allele"])
            except ValueError:
                pass

            # chromosomes named chrM... use the mitochondrial codon table
            if chrom.startswith("chrM"):
                table_args = ("Vertebrate Mitochondrial",)
            else:
                table_args = ()

            # now work through each mutant allele
            amino_acid_changes = []
            for mut_allele in alleles:
                mut_seq_list = list(ref_seq)
                mut_seq_list[replacement_coord] = mut_allele
                mut_seq = "".join(mut_seq_list)

                if frame_offset < 0:
                    # codon was read in chromosome order; flip it
                    ref_codon = reverse_complement(ref_seq)
                    mut_codon = reverse_complement(mut_seq)
                else:
                    ref_codon = ref_seq
                    mut_codon = mut_seq

                ref_residue = translate(ref_codon, *table_args)
                mut_residue = translate(mut_codon, *table_args)
                if ref_residue != mut_residue:
                    amino_acid_changes.append(
                        ref_residue + str(amino_acid_residue) + mut_residue)

            # return info
            if amino_acid_changes:
                return ("nonsynonymous coding", running_exon_count,
                        " ".join(amino_acid_changes))
            return ("synonymous coding", running_exon_count,
                    amino_acid_residue)

        # otherwise, continue the bookkeeping
        running_cds_bases_count += exon_end - exon_start
        exons.append([exon_start, exon_end])

    # not located in any coding exon or intron
    return None
def desc_variants(coding_seq1, coding_seq2):
    """
    Describe the amino acid change between a reference coding sequence
    (coding_seq1) and a variant coding sequence (coding_seq2), both given
    as DNA strings.

    The reference must have a length that is a multiple of 3 and must
    translate to a protein whose only stop codon ("*") is its last codon.

    Returns a string (positions are 1-based):
      - "" if the translations are identical, if a frameshift does not
        alter any reference residue, or if a pure insertion falls before
        the first amino acid;
      - "<ref aa><pos>Shift" for a frameshift (variant length not a
        multiple of 3), naming the first reference residue that differs;
      - "<aa before insert><its pos><aa before insert><inserted aas>" for
        a pure insertion;
      - "<deleted aas><pos>Del" for a pure deletion;
      - "<ref aas><pos><variant aas>" for anything else.
    Variant residues are never reported beyond the first stop codon.

    Raises AssertionError if the reference coding sequence is malformed.
    """
    # NOTE(review): this file appears to contain an earlier definition of
    # desc_variants as well; being later, this one is the one in effect.
    var_description = ""
    assert len(coding_seq1) % 3 == 0, (
        "Reference coding sequence malformed: "
        + "should have a length that is a multiple of 3! "
        + "DNA sequence is: " + coding_seq1
    )
    aa1 = list(translate(coding_seq1))
    assert "*" not in aa1[:-1], (
        "Reference coding sequence malformed: only "
        + "last codon should be stop codon! AA sequence is: "
        + "".join(aa1) + " DNA sequence is: " + coding_seq1
    )
    # "aa1 and ..." makes an empty reference fail this check instead of
    # raising a bare IndexError on aa1[-1].
    assert aa1 and aa1[-1] == "*", (
        "Reference coding sequence malformed: last "
        + "codon should be a stop codon! AA sequence is: "
        + "".join(aa1) + " DNA sequence is: " + coding_seq1
    )

    if len(coding_seq2) % 3 != 0:
        # Frameshift. Only complete codons can be translated, so drop the
        # trailing partial codon (floor division keeps the slice bound an
        # integer), then find the first amino acid that is changed.
        coding_seq2_trimmed = coding_seq2[:3 * (len(coding_seq2) // 3)]
        aa2 = list(translate(coding_seq2_trimmed))
        pos = 1
        while (pos <= len(aa1) and pos <= len(aa2)
               and aa1[pos - 1] == aa2[pos - 1]):
            pos += 1
        if pos <= len(aa1):
            var_description = aa1[pos - 1] + str(pos) + "Shift"
        return var_description

    aa2 = list(translate(coding_seq2))

    # Skip the residues shared at the start; remember the last shared one,
    # since insertions are reported relative to it.
    shared = 0
    limit = min(len(aa1), len(aa2))
    while shared < limit and aa1[shared] == aa2[shared]:
        shared += 1
    last_ref_aa = aa1[shared - 1] if shared else ""
    position = shared + 1
    aa1 = aa1[shared:]
    aa2 = aa2[shared:]

    # Drop the residues shared at the end.
    while aa1 and aa2 and aa1[-1] == aa2[-1]:
        aa1.pop()
        aa2.pop()

    # Don't report beyond the first stop codon of the variant; the
    # reference side is cut at the same offset.
    if "*" in aa2:
        cut = aa2.index("*") + 1
        report_aa1 = aa1[:cut]
        report_aa2 = aa2[:cut]
    else:
        report_aa1 = aa1
        report_aa2 = aa2

    if not aa1 and not aa2:
        # no change
        pass
    elif not aa1:
        # Pure insertion: report aa just before insert, pos of that aa,
        # repeat that aa and add the insert. Ignore if before 1st AA,
        # shouldn't get translated.
        if position > 1:
            var_description = (last_ref_aa + str(position - 1)
                               + last_ref_aa + "".join(report_aa2))
    elif not aa2:
        # Pure deletion: report all of aa1, first pos of aa1, "Del".
        var_description = "".join(aa1) + str(position) + "Del"
    else:
        # Substitution / indel: reference residues, first pos, variant
        # residues.
        var_description = ("".join(report_aa1) + str(position)
                           + "".join(report_aa2))
    return var_description