if not data:
            raise ValueError

        # move on if there's no info on where on the genome it falls
        if data[1] == "--":

        # figure out the chromosome name
        chr = "chr%s" % (data[1])

        # figure out what those A's and B's mean
        # we must replace the "A" designation first, then "B", because
        # "A" is also an actual unambiguous nucleotide, while "B" is not
        actual_call = call.replace("A", data[4]).replace("B", data[5])
        if data[1] != "--" and data[3] == "-":
            actual_call = reverse_complement(actual_call)

        # prepare output
        out_line = chr
        out_line += "\taffx\tsnp\t"
        out_line += "%s\t%s" % (data[2], data[2]
                                )  # info from the affx databases is 1-based
        out_line += "\t.\t+\t.\t"

        if actual_call[0] != actual_call[1]:
            out_line += "alleles " + "/".join(sorted(list(actual_call)))
            out_line += "alleles " + actual_call[0]

        out_line += ";db_xref affx:%s dbsnp:%s" % (probe_set_id, data[0])
        out_line += ";ras_1 " + ras_1
def infer_function(twobit_file, record, geneName, strand, cdsStart, cdsEnd, exonStarts, exonEnds):
	"""
	Infer "function" (as dbSNP calls it) given a reference TwoBitFile object,
	a GFFRecord object, and info about the gene: name, strand, coding sequence
	start, coding sequence end (both 0-based, half-open), exon starts (comma-
	separated string), and exon ends (comma-separated string).

	exonStarts and exonEnds may also be objects exposing tostring(); they are
	converted to strings first. geneName is accepted for the caller's
	convenience but is not used here.

	Returns a tuple consisting of the "function" (coding, 5'-UTR, etc.),
	followed by the exon or intron number (1-based, if applicable), and amino
	acid residue (1-based numeric type, if applicable) or change (1-based
	string, if applicable). Falls off the end (returning None) when the record
	lies inside the coding span but in none of the exons or introns examined.

	The intron and exon tests assume record.start == record.end (i.e. SNPs)
	and that both are 1-based.
	"""
	# we're done if it's not intronic or exonic
	# NOTE(review): these tests read record.strand, whereas everything below
	# uses the gene's strand argument -- confirm that this is intended
	if (record.strand == "+" and record.start <= cdsStart) or \
	  (record.strand == "-" and record.end > cdsEnd):
		return ("5'-UTR",)
	if (record.strand == "+" and record.end > cdsEnd) or \
	  (record.strand == "-" and record.start <= cdsStart):
		return ("3'-UTR",)

	# make exonStarts and exonEnds into lists
	# first, we have to make sure they're strings...
	try:
		exonStarts = exonStarts.tostring()
		exonEnds = exonEnds.tostring()
	# if we already have a string, tostring() won't work
	except AttributeError:
		pass
	# now, we really make them lists (a trailing comma is tolerated)
	exonStarts = [int(e) for e in exonStarts.strip(",").split(",")]
	exonEnds = [int(e) for e in exonEnds.strip(",").split(",")]

	# make a list of all exons, in case we need it; this is an untrimmed
	# snapshot in chromosome order, taken before any reversal below
	all_exons = list(zip(exonStarts, exonEnds))

	# reverse for strand; note how we set aside all_exons first before doing this
	# (reconstructed: the body of this branch was missing from the file)
	if strand == "-":
		exonStarts.reverse()
		exonEnds.reverse()

	# parse out exons, walking them in the direction of translation
	exons = []
	running_intron_count = running_exon_count = running_cds_bases_count = 0 # 1-based
	for exon_start, exon_end in zip(exonStarts, exonEnds):
		# discard any non-coding portions with this if statement
		if not (exon_end > cdsStart and exon_start <= cdsEnd):
			continue

		# trim the start and end to the coding region
		if exon_start < cdsStart:
			exon_start = cdsStart
		if exon_end > cdsEnd:
			exon_end = cdsEnd

		# increment the count
		running_exon_count += 1

		# look at the intron between the previous coding exon and this one,
		# if applicable
		if exons:
			if strand == "+":
				intron_start = exons[-1][1] # the end of the last exon considered
				intron_end = exon_start
			else:
				intron_start = exon_end
				intron_end = exons[-1][0]
			running_intron_count += 1
			# test if is in intron (remember, start and end are 1-based)
			# (this only works if record.start = record.end (i.e. SNPs);
			# otherwise, this will need to be adapted by taking strand
			# into account)
			if record.start > intron_start and record.end <= intron_end:
				return ("intron", running_intron_count)

		# look at exon (again, this only works if record.start = record.end
		# and assumes both are 1-based)
		if record.start > exon_start and record.end <= exon_end:
			# figure out number of bases, amino acid residues, frame
			if strand == "+":
				running_cds_bases_count += record.start - exon_start
				frame_offset = running_cds_bases_count % 3
				if frame_offset == 0:
					frame_offset = 3
				# chr direction =>
				# translation direction =>
				# -------------
				# | 1 | 2 | 3 |
				# -------------
				#   ^ first base of codon
				# note that this convention corresponds to frames 0, 2, 1
				# respectively in GTF notation
			else:
				running_cds_bases_count += exon_end + 1 - record.end
				frame_offset = -1 * (running_cds_bases_count % 3)
				if frame_offset == 0:
					frame_offset = -3
				# chr direction =>
				# <= translation direction
				# -------------
				# |-3 |-2 |-1 |
				# -------------
				#           ^ first base of codon
				# note that this convention corresponds to frames 1, 2, 0
				# respectively in GTF notation

			# divide by 3 and take the ceiling as an integer: bases 1-3 are
			# residue 1, bases 4-6 are residue 2, and so on
			amino_acid_residue = int(math.ceil(float(running_cds_bases_count) / 3))

			# figure out what we need, and prepare to look it up
			start_exon, end_exon, intervals = \
			  codon_intersect(record.start - 1, record.end, all_exons, frame_offset)

			# calculate the chromosome name we want to use
			if record.seqname.startswith("chr"):
				chrom = record.seqname
			else:
				chrom = "chr" + record.seqname
			is_mitochondrial = chrom.startswith("chrM")

			# look it up; the codon may be spread over several intervals
			ref_seq = "".join([twobit_file[chrom][k[0]:k[1]] for k in intervals])

			# index of the variant base within ref_seq (chromosome order):
			# offsets 1, 2, 3 map to 0, 1, 2 and -3, -2, -1 map to 0, 1, 2
			replacement_coord = (frame_offset + 3) % 4

			# figure out which allele is not the mutant
			# (reconstructed: the statement guarded by "except ValueError" was
			# missing from the file; dropping the reference base matches the
			# comment above and the use of the remaining alleles below)
			alleles = record.attributes["alleles"].strip("\"").split("/")
			try:
				alleles.remove(ref_seq[replacement_coord])
			except ValueError:
				pass

			# now work through each mutant allele
			amino_acid_changes = []
			for mut_allele in alleles:
				mut_seq_list = list(ref_seq)
				mut_seq_list[replacement_coord] = mut_allele
				mut_seq = "".join(mut_seq_list)

				# a negative offset means the codon is read off the
				# reverse strand
				if frame_offset > 0:
					ref_codon, mut_codon = ref_seq, mut_seq
				else:
					ref_codon = reverse_complement(ref_seq)
					mut_codon = reverse_complement(mut_seq)

				# mitochondrial chromosomes use their own genetic code
				if is_mitochondrial:
					ref_residue = translate(ref_codon, "Vertebrate Mitochondrial")
					mut_residue = translate(mut_codon, "Vertebrate Mitochondrial")
				else:
					ref_residue = translate(ref_codon)
					mut_residue = translate(mut_codon)

				if ref_residue != mut_residue:
					amino_acid_changes.append(ref_residue + str(amino_acid_residue) + mut_residue)

			# return info
			if amino_acid_changes:
				return ("nonsynonymous coding", running_exon_count, " ".join(amino_acid_changes))
			else:
				return ("synonymous coding", running_exon_count, amino_acid_residue)

		# otherwise, continue the bookkeeping
		running_cds_bases_count += exon_end - exon_start
		exons.append([exon_start, exon_end])
def main():
	"""
	Print, in FASTA form, the reference sequence under each record of a GFF
	file, looked up in a 2bit file.

	Expects two positional arguments, the 2bit file and the GFF file; if
	opening them in that order fails, the opposite order is tried. Options
	read here: flank (number of flanking bases to add on each side), strand
	(reverse-complement the output for "-" strand records), and unique
	(collapse consecutive equal records into one, annotated with a
	"repetition_count" attribute).
	"""
	# parse options
	option, args = doc_optparse.parse(__doc__)

	# NOTE(review): the body of this check was missing from the file; exiting
	# through doc_optparse is assumed here -- confirm against the helper module
	if len(args) < 2:
		doc_optparse.exit()

	flank = int(option.flank or 0)

	# try opening the file both ways, in case the arguments got confused
	try:
		gff_file = gff.input(args[1])
		twobit_file = twobit.input(args[0])
	except Exception:
		gff_file = gff.input(args[0])
		twobit_file = twobit.input(args[1])

	# variables to keep track of uniqueness; only consulted with option.unique
	previous_record = None
	previous_ref_seq = None
	repetition_count = 1

	for record in gff_file:
		# if we're using the unique option, output the previous record only when
		# we're sure we've seen all repetitions of it
		if option.unique and record == previous_record:
			repetition_count += 1
			# the sequence stored for the first occurrence is reused, so there
			# is nothing more to do for a repetition
			# NOTE(review): this early skip was not in the text as found;
			# confirm that equal records always yield the same sequence
			continue
		elif option.unique:
			if previous_record:
				previous_record.attributes["repetition_count"] = str(repetition_count)
				print(FastaRecord(str(previous_record).replace("\t", "|"), previous_ref_seq))
			repetition_count = 1
			previous_record = record

		# calculate the chromosome name we want to use
		if record.seqname.startswith("chr"):
			chrom = record.seqname
		else:
			chrom = "chr" + record.seqname

		# record coordinates are 1-based inclusive; the slice is 0-based
		ref_seq = twobit_file[chrom][(record.start - 1):record.end]

		if flank != 0:
			# calculate the flanks (these variables are 0-based)
			left_flank_start = record.start - flank - 1
			left_flank_end = record.start - 1
			if left_flank_start < 0:
				left_flank_start = 0
			right_flank_start = record.end
			right_flank_end = record.end + flank
			# now find them
			left_flank_seq = twobit_file[chrom][left_flank_start:left_flank_end]
			right_flank_seq = twobit_file[chrom][right_flank_start:right_flank_end]
			ref_seq = left_flank_seq + "\n\n" + ref_seq + "\n\n" + right_flank_seq

		if option.strand and record.strand == "-":
			ref_seq = reverse_complement(ref_seq)

		# we don't output the current record if we're using the unique option
		if option.unique:
			previous_ref_seq = ref_seq
		else:
			print(FastaRecord(str(record).replace("\t", "|"), ref_seq))

	# we'll have one last record yet to output if we used the unique option;
	# there is none when the GFF input was empty
	if option.unique and previous_record is not None:
		previous_record.attributes["repetition_count"] = str(repetition_count)
		print(FastaRecord(str(previous_record).replace("\t", "|"), previous_ref_seq))

if __name__ == "__main__":
	main()