def desc_variants(coding_seq1, coding_seq2):
    """Describe the amino-acid level difference between two coding sequences.

    Both sequences are translated with ``translate`` and the resulting
    residue lists are compared.

    Args:
        coding_seq1: Reference coding DNA. Its length must be a multiple of
            three, and its translation must contain exactly one stop ("*"),
            located at the very end.
        coding_seq2: Variant coding DNA.

    Returns:
        A description string, built as follows:

        * variant length not a multiple of three (frameshift):
          ``<ref aa><position>Shift`` for the first reference residue that
          is not matched by the variant translation;
        * pure insertion: ``<preceding ref aa><its position><preceding ref
          aa + inserted aas>``;
        * pure deletion: ``<deleted aas><position of first deleted aa>Del``;
        * anything else: ``<ref aas><position of first changed aa><variant
          aas>``.

        Positions are 1-based. Variant residues are never reported past the
        first stop they contain. An empty string is returned when the
        translations are identical, when an insertion precedes the first
        residue, or when a frameshift leaves every reference residue
        matched.

    Raises:
        AssertionError: if the reference sequence violates the constraints
            listed above.
    """
    assert len(coding_seq1) % 3 == 0, (
        "Reference coding sequence malformed: "
        "should have a length that is a multiple of 3! "
        "DNA sequence is: " + coding_seq1
    )
    aa1 = list(translate(coding_seq1))
    assert "*" not in aa1[:-1], (
        "Reference coding sequence malformed: only "
        "last codon should be stop codon! AA sequence is: "
        + "".join(aa1)
        + " DNA sequence is: "
        + coding_seq1
    )
    assert aa1[-1] == "*", (
        "Reference coding sequence malformed: last "
        "codon should be a stop codon! AA sequence is: "
        + "".join(aa1)
        + " DNA sequence is: "
        + coding_seq1
    )

    if len(coding_seq2) % 3 != 0:
        # Frameshift. Only whole codons can be translated, so drop the
        # trailing partial codon first. Floor division keeps the slice bound
        # an integer on both Python 2 and Python 3.
        whole_codon_len = 3 * (len(coding_seq2) // 3)
        aa2 = list(translate(coding_seq2[:whole_codon_len]))

        # Count the residues shared at the start of both translations.
        matched = 0
        for ref_aa, alt_aa in zip(aa1, aa2):
            if ref_aa != alt_aa:
                break
            matched += 1
        if matched < len(aa1):
            return aa1[matched] + str(matched + 1) + "Shift"
        return ""

    aa2 = list(translate(coding_seq2))

    # Strip the common prefix, remembering the last shared residue: it is
    # the anchor used to describe a pure insertion.
    shared = 0
    for ref_aa, alt_aa in zip(aa1, aa2):
        if ref_aa != alt_aa:
            break
        shared += 1
    position = shared + 1  # 1-based position of the first differing residue
    last_ref_aa = aa1[shared - 1] if shared else ""
    aa1 = aa1[shared:]
    aa2 = aa2[shared:]

    # Strip the common suffix.
    while aa1 and aa2 and aa1[-1] == aa2[-1]:
        aa1.pop()
        aa2.pop()

    if not aa1 and not aa2:  # no change
        return ""

    # Never report variant residues beyond the first stop. A cut of None
    # means "no stop found", which slices to the whole list.
    cut = aa2.index("*") + 1 if "*" in aa2 else None

    if not aa1:  # pure insertion
        if position == 1:
            # Inserted before the first residue: should not get translated.
            return ""
        # Anchor residue, its position, then anchor repeated plus the insert.
        return last_ref_aa + str(position - 1) + last_ref_aa + "".join(aa2[:cut])

    if not aa2:  # pure deletion
        return "".join(aa1) + str(position) + "Del"

    # Substitution / indel: when the variant has a stop, the reference side
    # is truncated to the same number of residues as the variant side.
    return "".join(aa1[:cut]) + str(position) + "".join(aa2[:cut])
def infer_function(twobit_file, record, geneName, strand, cdsStart, cdsEnd, exonStarts, exonEnds):
	"""
	Infer "function" (as dbSNP calls it) given a reference TwoBitFile object,
	a GFFRecord object, and info about the gene: name, strand, coding sequence
	start, coding sequence end (both 0-based, half-open), exon starts (comma-
	separated string), and exon ends (comma-separated string).

	Returns a tuple consisting of the "function" (coding, 5'-UTR, etc.),
	followed by the exon or intron number (1-based, if applicable), and amino
	acid residue (1-based numeric type, if applicable) or change (1-based
	string, if applicable). Concretely, one of:

	  ("5'-UTR",)
	  ("3'-UTR",)
	  ("intron", intron_number)
	  ("synonymous coding", exon_number, residue_number)
	  ("nonsynonymous coding", exon_number, "R12K ...")

	If the record falls in none of the examined regions the function runs
	off the end of the exon loop and returns None.

	Only exons overlapping the coding region are counted, so exon and intron
	numbers are relative to the coding exons, in transcription order.
	geneName is accepted but not used here. The intron/exon tests assume
	record.start == record.end (i.e. SNPs) with 1-based coordinates.
	"""
	# we're done if it's not intronic or exonic
	# NOTE(review): these two tests use record.strand, whereas everything
	# below uses the gene's `strand` argument -- confirm the two always agree.
	if (record.strand == "+" and record.start <= cdsStart) or \
	  (record.strand == "-" and record.end > cdsEnd):
		return ("5'-UTR",)

	if (record.strand == "+" and record.end > cdsEnd) or \
	  (record.strand == "-" and record.start <= cdsStart):
		return ("3'-UTR",)

	# make exonStarts and exonEnds into lists
	# first, we have to make sure they're strings...
	try:
		exonStarts = exonStarts.tostring()
		exonEnds = exonEnds.tostring()
	# if we already have a string, tostring() won't work
	except AttributeError:
		pass

	# now, we really make them lists (int() accepts arbitrarily large values
	# and, unlike long(), also exists on Python 3)
	exonStarts = [int(e) for e in exonStarts.strip(",").split(",")]
	exonEnds = [int(e) for e in exonEnds.strip(",").split(",")]

	# make a list of all exons, in case we need it; this must be a real list
	# (a snapshot), because exonStarts/exonEnds are reversed and trimmed in
	# place below and a lazy zip object would observe those changes
	all_exons = list(zip(exonStarts, exonEnds))

	# reverse for strand; note how we set aside all_exons first before doing this
	if strand == "-":
		exonStarts.reverse()
		exonEnds.reverse()

	# parse out exons
	exons = []
	running_intron_count = running_exon_count = running_cds_bases_count = 0 # 1-based

	for j in range(len(exonStarts)):
		# discard any non-coding portions with this if statement
		if exonEnds[j] > cdsStart and exonStarts[j] <= cdsEnd:

			# trim the start and end to the coding region
			if exonStarts[j] < cdsStart:
				exonStarts[j] = cdsStart
			if exonEnds[j] > cdsEnd:
				exonEnds[j] = cdsEnd

			# increment the count
			running_exon_count += 1

			# look at the intron, if applicable
			if len(exons) > 0:
				if strand == "+":
					# from the end of the last exon considered to the
					# start of this one
					intron_start = exons[-1][1]
					intron_end = exonStarts[j]
				else:
					# exons are visited in descending coordinate order, so
					# the intron lies between this exon's end and the
					# previously visited exon's start
					intron_start = exonEnds[j]
					intron_end = exons[-1][0]

				running_intron_count += 1

				# test if is in intron (remember, start and end are 1-based)
				# (this only works if record.start = record.end (i.e. SNPs);
				# otherwise, this will need to be adapted by taking strand
				# into account)
				if (record.start > intron_start and record.end <= intron_end):
					return ("intron", running_intron_count)

			# look at exon (again, this only works if record.start = record.end
			# and assumes both are 1-based)
			if (record.start > exonStarts[j] and record.end <= exonEnds[j]):
				# figure out number of bases, amino acid residues, frame
				if strand == "+":
					running_cds_bases_count += record.start - exonStarts[j]
					frame_offset = running_cds_bases_count % 3
					if frame_offset == 0:
						frame_offset = 3
						# chr direction =>
						# translation direction =>
						# -------------
						# | 1 | 2 | 3 |
						# -------------
						#   ^ first base of codon
						#
						# note that this convention corresponds to frames 0, 2, 1
						# respectively in GTF notation
				else:
					running_cds_bases_count += exonEnds[j] + 1 - record.end
					frame_offset = -1 * (running_cds_bases_count % 3)
					if frame_offset == 0:
						frame_offset = -3
						# chr direction =>
						# <= translation direction
						# -------------
						# |-3 |-2 |-1 |
						# -------------
						#           ^ first base of codon
						#
						# note that this convention corresponds to frames 1, 2, 0
						# respectively in GTF notation

				# 1-based residue number: ceiling of (coding bases so far / 3),
				# done in integer arithmetic
				amino_acid_residue = (running_cds_bases_count + 2) // 3

				# figure out what we need, and prepare to look it up
				start_exon, end_exon, intervals = \
				  codon_intersect(record.start - 1, record.end, all_exons, frame_offset)

				# calculate the chromosome name we want to use
				if record.seqname.startswith("chr"):
					chrom = record.seqname
				else:
					chrom = "chr" + record.seqname

				# look it up
				ref_seq = "".join([twobit_file[chrom][k[0]:k[1]] for k in intervals])

				# index of the variant base within the codon, counted in
				# chromosome direction: frame offsets 1, 2, 3 map to 0, 1, 2
				# and -3, -2, -1 map to 0, 1, 2 as well
				replacement_coord = (frame_offset + 3) % 4

				# figure out which allele is not the mutant
				alleles = record.attributes["alleles"].strip("\"").split("/")
				try:
					alleles.remove(record.attributes["ref_allele"])
				except ValueError:
					pass

				# sequences on chrM are translated with the mitochondrial table
				if chrom.startswith("chrM"):
					table_args = ("Vertebrate Mitochondrial",)
				else:
					table_args = ()

				# now work through each mutant allele
				amino_acid_changes = []
				for mut_allele in alleles:
					mut_seq_list = list(ref_seq)
					mut_seq_list[replacement_coord] = mut_allele
					mut_seq = "".join(mut_seq_list)

					# a negative frame offset means the codon is read on the
					# reverse strand (frame_offset is never 0 at this point)
					if frame_offset < 0:
						ref_codon = reverse_complement(ref_seq)
						mut_codon = reverse_complement(mut_seq)
					else:
						ref_codon = ref_seq
						mut_codon = mut_seq

					ref_residue = translate(ref_codon, *table_args)
					mut_residue = translate(mut_codon, *table_args)

					if ref_residue != mut_residue:
						amino_acid_changes.append(ref_residue + str(amino_acid_residue) + mut_residue)

				# return info
				if amino_acid_changes:
					return ("nonsynonymous coding", running_exon_count, " ".join(amino_acid_changes))
				return ("synonymous coding", running_exon_count, amino_acid_residue)

			# otherwise, continue the bookkeeping
			running_cds_bases_count += exonEnds[j] - exonStarts[j]
			exons.append([exonStarts[j], exonEnds[j]])
def desc_variants(coding_seq1, coding_seq2):
    """Describe the amino-acid level difference between two coding sequences.

    Both sequences are translated with ``translate`` and the resulting
    residue lists are compared.

    Args:
        coding_seq1: Reference coding DNA. Its length must be a multiple of
            three, and its translation must contain exactly one stop ("*"),
            located at the very end.
        coding_seq2: Variant coding DNA.

    Returns:
        A description string, built as follows:

        * variant length not a multiple of three (frameshift):
          ``<ref aa><position>Shift`` for the first reference residue that
          is not matched by the variant translation;
        * pure insertion: ``<preceding ref aa><its position><preceding ref
          aa + inserted aas>``;
        * pure deletion: ``<deleted aas><position of first deleted aa>Del``;
        * anything else: ``<ref aas><position of first changed aa><variant
          aas>``.

        Positions are 1-based. Variant residues are never reported past the
        first stop they contain. An empty string is returned when the
        translations are identical, when an insertion precedes the first
        residue, or when a frameshift leaves every reference residue
        matched.

    Raises:
        AssertionError: if the reference sequence violates the constraints
            listed above.
    """
    assert len(coding_seq1) % 3 == 0, (
        "Reference coding sequence malformed: "
        "should have a length that is a multiple of 3! "
        "DNA sequence is: " + coding_seq1
    )
    aa1 = list(translate(coding_seq1))
    assert "*" not in aa1[:-1], (
        "Reference coding sequence malformed: only "
        "last codon should be stop codon! AA sequence is: "
        + "".join(aa1)
        + " DNA sequence is: "
        + coding_seq1
    )
    assert aa1[-1] == "*", (
        "Reference coding sequence malformed: last "
        "codon should be a stop codon! AA sequence is: "
        + "".join(aa1)
        + " DNA sequence is: "
        + coding_seq1
    )

    if len(coding_seq2) % 3 != 0:
        # Frameshift. Only whole codons can be translated, so drop the
        # trailing partial codon first. Floor division keeps the slice bound
        # an integer on both Python 2 and Python 3.
        whole_codon_len = 3 * (len(coding_seq2) // 3)
        aa2 = list(translate(coding_seq2[:whole_codon_len]))

        # Count the residues shared at the start of both translations.
        matched = 0
        for ref_aa, alt_aa in zip(aa1, aa2):
            if ref_aa != alt_aa:
                break
            matched += 1
        if matched < len(aa1):
            return aa1[matched] + str(matched + 1) + "Shift"
        return ""

    aa2 = list(translate(coding_seq2))

    # Strip the common prefix, remembering the last shared residue: it is
    # the anchor used to describe a pure insertion.
    shared = 0
    for ref_aa, alt_aa in zip(aa1, aa2):
        if ref_aa != alt_aa:
            break
        shared += 1
    position = shared + 1  # 1-based position of the first differing residue
    last_ref_aa = aa1[shared - 1] if shared else ""
    aa1 = aa1[shared:]
    aa2 = aa2[shared:]

    # Strip the common suffix.
    while aa1 and aa2 and aa1[-1] == aa2[-1]:
        aa1.pop()
        aa2.pop()

    if not aa1 and not aa2:  # no change
        return ""

    # Never report variant residues beyond the first stop. A cut of None
    # means "no stop found", which slices to the whole list.
    cut = aa2.index("*") + 1 if "*" in aa2 else None

    if not aa1:  # pure insertion
        if position == 1:
            # Inserted before the first residue: should not get translated.
            return ""
        # Anchor residue, its position, then anchor repeated plus the insert.
        return last_ref_aa + str(position - 1) + last_ref_aa + "".join(aa2[:cut])

    if not aa2:  # pure deletion
        return "".join(aa1) + str(position) + "Del"

    # Substitution / indel: when the variant has a stop, the reference side
    # is truncated to the same number of residues as the variant side.
    return "".join(aa1[:cut]) + str(position) + "".join(aa2[:cut])