예제 #1
0
        if not data:
            raise ValueError

        # move on if there's no info on where on the genome it falls
        if data[1] == "--":
            continue

        # figure out the chromosome name
        chr = "chr%s" % (data[1])

        # figure out what those A's and B's mean
        # we must replace the "A" designation first, then "B", because
        # "A" is also an actual unambiguous nucleotide, while "B" is not
        actual_call = call.replace("A", data[4]).replace("B", data[5])
        if data[1] != "--" and data[3] == "-":
            actual_call = reverse_complement(actual_call)

        # prepare output
        out_line = chr
        out_line += "\taffx\tsnp\t"
        out_line += "%s\t%s" % (data[2], data[2]
                                )  # info from the affx databases is 1-based
        out_line += "\t.\t+\t.\t"

        if actual_call[0] != actual_call[1]:
            out_line += "alleles " + "/".join(sorted(list(actual_call)))
        else:
            out_line += "alleles " + actual_call[0]

        out_line += ";db_xref affx:%s dbsnp:%s" % (probe_set_id, data[0])
        out_line += ";ras_1 " + ras_1
 if not data:
     raise ValueError
 
 # move on if there's no info on where on the genome it falls
 if data[1] == "--":
     continue
 
 # figure out the chromosome name
 chr = "chr%s" % (data[1])
 
 # figure out what those A's and B's mean
 # we must replace the "A" designation first, then "B", because
 # "A" is also an actual unambiguous nucleotide, while "B" is not
 actual_call = call.replace("A", data[4]).replace("B", data[5])
 if data[1] != "--" and data[3] == "-":
     actual_call = reverse_complement(actual_call)
 
 # prepare output
 out_line = chr
 out_line += "\taffx\tsnp\t"
 out_line += "%s\t%s" % (data[2], data[2]) # info from the affx databases is 1-based
 out_line += "\t.\t+\t.\t"
 
 if actual_call[0] != actual_call[1]:
     out_line += "alleles " + "/".join(sorted(list(actual_call)))
 else:
     out_line += "alleles " + actual_call[0]
 
 out_line += ";db_xref affx:%s dbsnp:%s" % (probe_set_id, data[0])
 out_line += ";ras_1 " + ras_1
 out_line += ";ras_2 " + ras_2
def _translate_codon(codon, frame_offset, chrom):
	"""
	Translate a codon that was read off the chromosome in chromosome order.

	codon is the nucleotide string, frame_offset is the signed frame computed
	by infer_function (negative for genes on the "-" strand, never zero), and
	chrom is the "chr"-prefixed chromosome name.

	The codon is reverse-complemented first when frame_offset is negative, and
	the "Vertebrate Mitochondrial" table is used for chromosomes whose name
	starts with "chrM". Returns whatever translate() returns.
	"""
	if frame_offset < 0:
		codon = reverse_complement(codon)
	if chrom.startswith("chrM"):
		return translate(codon, "Vertebrate Mitochondrial")
	return translate(codon)


def infer_function(twobit_file, record, geneName, strand, cdsStart, cdsEnd, exonStarts, exonEnds):
	"""
	Infer "function" (as dbSNP calls it) given a reference TwoBitFile object,
	a GFFRecord object, and info about the gene: name, strand, coding sequence
	start, coding sequence end (both 0-based, half-open), exon starts (comma-
	separated string), and exon ends (comma-separated string).
	
	Returns a tuple consisting of the "function" (coding, 5'-UTR, etc.),
	followed by the exon or intron number (1-based, if applicable), and amino
	acid residue (1-based numeric type, if applicable) or change (1-based
	string, if applicable):
	
	  ("5'-UTR",) or ("3'-UTR",)
	  ("intron", intron_number)
	  ("nonsynonymous coding", exon_number, "<ref><residue><mut> ...")
	  ("synonymous coding", exon_number, residue_number)
	
	If the record matches none of the tests below, the loop finishes without
	an explicit return value.
	
	The intron and exon tests only work for records where start == end
	(i.e. SNPs); record.start and record.end are treated as 1-based.
	"""
	# we're done if it's not intronic or exonic; note that the 5'/3' decision
	# is keyed on the strand of the record, not on the strand argument
	if (record.strand == "+" and record.start <= cdsStart) or \
	  (record.strand == "-" and record.end > cdsEnd):
		return ("5'-UTR",)
	
	if (record.strand == "+" and record.end > cdsEnd) or \
	  (record.strand == "-" and record.start <= cdsStart):
		return ("3'-UTR",)
	
	# exonStarts and exonEnds may arrive as objects offering tostring()
	# rather than as strings; a plain string has no tostring(), so the
	# AttributeError simply means there is nothing to convert
	try:
		exonStarts = exonStarts.tostring()
		exonEnds = exonEnds.tostring()
	except AttributeError:
		pass
	
	# split the comma-separated strings (a trailing comma is tolerated)
	exonStarts = [long(e) for e in exonStarts.strip(",").split(",")]
	exonEnds = [long(e) for e in exonEnds.strip(",").split(",")]
	
	# keep a chromosome-ordered list of all exons for codon_intersect;
	# this must be built before the per-strand reversal below
	all_exons = zip(exonStarts, exonEnds)
	
	# walk the exons in transcript order
	if strand == "-":
		exonStarts.reverse()
		exonEnds.reverse()
	
	# coding exons already walked, as [start, end] trimmed to the CDS
	exons = []
	running_intron_count = running_exon_count = running_cds_bases_count = 0 # 1-based
	
	for j in range(len(exonStarts)):
		# discard exons that do not overlap the coding region at all
		if not (exonEnds[j] > cdsStart and exonStarts[j] <= cdsEnd):
			continue
		
		# trim the start and end to the coding region
		if exonStarts[j] < cdsStart:
			exonStarts[j] = cdsStart
		if exonEnds[j] > cdsEnd:
			exonEnds[j] = cdsEnd
		
		running_exon_count += 1
		
		# look at the intron between the previous coding exon and this one
		if len(exons) > 0:
			if strand == "+":
				intron_start = exons[-1][1]
				intron_end = exonStarts[j]
			else:
				intron_start = exonEnds[j]
				intron_end = exons[-1][0]
			
			running_intron_count += 1
			
			# start and end are 1-based, the intron bounds are 0-based half-open
			if record.start > intron_start and record.end <= intron_end:
				return ("intron", running_intron_count)
		
		# look at the exon itself
		if record.start > exonStarts[j] and record.end <= exonEnds[j]:
			# count coding bases up to and including the variant, in the
			# direction of translation, and derive the signed frame:
			#
			#   "+" strand          "-" strand
			#   chr direction =>    chr direction =>
			#   translation =>      <= translation
			#   -------------       -------------
			#   | 1 | 2 | 3 |       |-3 |-2 |-1 |
			#   -------------       -------------
			#     ^ first base                ^ first base
			#
			# (GTF frames 0, 2, 1 and 1, 2, 0 respectively)
			if strand == "+":
				running_cds_bases_count += record.start - exonStarts[j]
				frame_offset = running_cds_bases_count % 3
				if frame_offset == 0:
					frame_offset = 3
			else:
				running_cds_bases_count += exonEnds[j] + 1 - record.end
				frame_offset = -1 * (running_cds_bases_count % 3)
				if frame_offset == 0:
					frame_offset = -3
			
			# ceiling of (bases / 3) as a long: convert to float, divide,
			# take the ceiling, then convert back
			amino_acid_residue = long(math.ceil(float(running_cds_bases_count) / 3))
			
			# find the chromosome intervals that make up this codon
			start_exon, end_exon, intervals = \
			  codon_intersect(record.start - 1, record.end, all_exons, frame_offset)
			
			# calculate the chromosome name we want to use
			if record.seqname.startswith("chr"):
				chrom = record.seqname
			else:
				chrom = "chr" + record.seqname
			
			# look up the reference codon, in chromosome order
			ref_seq = "".join([twobit_file[chrom][k[0]:k[1]] for k in intervals])
			
			# index of the variant base inside the codon:
			# frames 1, 2, 3 -> 0, 1, 2 and frames -3, -2, -1 -> 0, 1, 2
			replacement_coord = (frame_offset + 3) % 4
			
			# every listed allele other than the reference is a mutant
			alleles = record.attributes["alleles"].strip("\"").split("/")
			try:
				alleles.remove(record.attributes["ref_allele"])
			except ValueError:
				pass
			
			# now work through each mutant allele
			amino_acid_changes = []
			for mut_allele in alleles:
				mut_seq_list = list(ref_seq)
				mut_seq_list[replacement_coord] = mut_allele
				mut_seq = "".join(mut_seq_list)
				
				ref_residue = _translate_codon(ref_seq, frame_offset, chrom)
				mut_residue = _translate_codon(mut_seq, frame_offset, chrom)
				
				if ref_residue != mut_residue:
					amino_acid_changes.append(ref_residue + str(amino_acid_residue) + mut_residue)
			
			# any recorded change makes the variant nonsynonymous
			if amino_acid_changes:
				return ("nonsynonymous coding", running_exon_count, " ".join(amino_acid_changes))
			return ("synonymous coding", running_exon_count, amino_acid_residue)
		
		# otherwise, continue the bookkeeping
		running_cds_bases_count += exonEnds[j] - exonStarts[j]
		exons.append([exonStarts[j], exonEnds[j]])
예제 #4
0
        
        # quit if we don't have an rs number
        if not rs:
            continue
        # we wouldn't know what to do with this, so pass it up for now
        if len(alleles) > 2:
            continue
        
        # create the genotype string from the given alleles
        #TODO: do something about the Y chromosome
        if len(alleles) == 1:
            genotype = alleles[0]
            alleles = [alleles[0], alleles[0]]
        else:
            genotype = ';'.join(sorted(alleles))
        reverse_alleles = [reverse_complement(a) for a in alleles]
        
        # query the database
        cursor.execute(query, (rs,
                       alleles[0] + '%',
                       alleles[1] + '%',
                       reverse_alleles[0] + '%',
                       reverse_alleles[1] + '%'))
        data = cursor.fetchall()
        
        # move on if we don't have info
        if cursor.rowcount <= 0:
            continue

        gene_acid_base = None
        gene = None
예제 #5
0
			print >> sys.stderr, "# not found:"
			print >> sys.stderr, line
			continue
		# this would be very strange
		if chr != datum[0] and chr != "None":
			print >> sys.stderr, "# not on expected chromosome %s:" % datum[0]
			print >> sys.stderr, line
			continue
		else:
			chr = datum[0]
		
		strand = datum[3]
		
		# filter out genotypes that don't match the reference allele, if asked to
		if option.reference:
			ref = twobit_file[chr][datum[1]:datum[2]]
			if strand == "-":
				ref = reverse_complement(ref)
			if genotype != (ref + ";" + ref):
				continue
		
		print "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (phenotype, chr, datum[1], datum[2],
		  strand, genotype, pubmed, rs)
	
	# close database cursor and connection
	cursor.close()
	connection.close()

if __name__ == "__main__":
	main()
예제 #6
0
			if x.startswith("dbsnp:rs"):
				rs = x.replace("dbsnp:", "")
				break
		
		# quit if we don't have an rs number
		if not rs:
			continue
		# we wouldn't know what to do with this, so pass it up for now
		if len(alleles) > 2:
			continue
		
		# create the genotype string from the given alleles
		#TODO: do something about the Y chromosome
		if len(alleles) == 1:
			genotype = alleles[0] + ";" + alleles[0]
			reverse_genotype = reverse_complement(alleles[0]) + ";" + reverse_complement(alleles[0])
		else:
			genotype = ';'.join(sorted(alleles))
			reverse_genotype = ';'.join(sorted([reverse_complement(a) for a in alleles]))
		
		# query the database
		cursor.execute(query, (rs, genotype, reverse_genotype))
		data = cursor.fetchall()
		
		# move on if we don't have info
		if cursor.rowcount <= 0:
			continue
			
		for d in data:
			phenotype = d[0]
			pubmed = d[1]
				unavailable = False
		
		if unavailable:
			l["maf"] = "N/A"
			if "trait_allele" in l:
				l["taf"] = "N/A"
		else:
			# output minor allele frequency as a dictionary, with population abbrs as keys
			l["maf"] = dict(zip([d[1] for d in data], [float(min(d[3], d[5])) for d in data]))
			# output trait allele frequency as a dictionary; this one is a little trickier
			if "trait_allele" in l:
				l["taf"] = {"all_n": 0,
					    "all_d": 0}
				for d in data:
					if d[0] == "+" and l["trait_allele"] == d[2] \
					  or d[0] == "-" and l["trait_allele"] == reverse_complement(d[2]):
						l["taf"][d[1]] = float(d[3])
						l["taf"]["all_n"] += d[6]
					elif d[0] == "+" and l["trait_allele"] == d[4] \
					  or d[0] == "-" and l["trait_allele"] == reverse_complement(d[4]):
						l["taf"][d[1]] = float(d[5])
						l["taf"]["all_n"] += d[7]
					l["taf"]["all_d"] += d[6]+d[7]
		print json.dumps(l)
	
	# close database cursor and connection
	cursor.close()
	connection.close()

if __name__ == "__main__":
	main()
def _fetch_ranges(twobit_file, chrom, ranges):
    """
    Return the concatenation of twobit_file[chrom][start:end] for every
    (start, end) pair in ranges, in the order given.
    """
    return "".join([twobit_file[chrom][e[0]:e[1]] for e in ranges])


def infer_function(twobit_file, record, geneName, strand, cdsStart, cdsEnd, exonStarts, exonEnds):
    """
    Infer "function" (as dbSNP calls it) given a reference TwoBitFile object,
    a GFFRecord object, and info about the gene: name, strand, coding sequence
    start, coding sequence end (both 0-based, half-open), exon starts (comma-
    separated string), and exon ends (comma-separated string).

    Returns a tuple whose first element names the "function":

      ("5'-UTR",) or ("3'-UTR",)      record lies outside the coding region
      ("intron", n)                   n counts introns between coding exons,
                                      1-based, in transcript order
      ("span_coding_edge",)           record spans, or is a zero-length
                                      record at, cdsStart or cdsEnd
      ("span_exon_boundary",)         record spans an edge of a coding exon
      ("nonsynonymous coding", 1, s)  s is the space-joined desc_variants()
                                      results for the non-reference alleles
      ("synonymous coding",)          no allele produced a description

    If the record matches none of the tests, the loop finishes without an
    explicit return value. Calls sys.exit() when the length of ref_allele
    disagrees with the record's start and end.
    """
    # normalize the chromosome name to its "chr"-prefixed form
    if record.seqname.startswith("chr"):
        chrom = record.seqname
    else:
        chrom = "chr" + record.seqname

    # records entirely outside the coding region are reported as UTR; the
    # 5'/3' decision is keyed on the strand of the record
    if (record.strand == "+" and record.end <= cdsStart) or (record.strand == "-" and record.start > cdsEnd):
        return ("5'-UTR",)

    if (record.strand == "+" and record.start > cdsEnd) or (record.strand == "-" and record.end <= cdsStart):
        return ("3'-UTR",)

    # exonStarts and exonEnds may arrive as objects offering tostring()
    # rather than as strings; a plain string has no tostring(), so the
    # AttributeError simply means there is nothing to convert
    try:
        exonStarts = exonStarts.tostring()
        exonEnds = exonEnds.tostring()
    except AttributeError:
        pass

    # split the comma-separated strings (a trailing comma is tolerated)
    exonStarts = [long(e) for e in exonStarts.strip(",").split(",")]
    exonEnds = [long(e) for e in exonEnds.strip(",").split(",")]

    # walk the exons in transcript order
    if strand == "-":
        exonStarts.reverse()
        exonEnds.reverse()

    # keep only the exons that overlap the coding region; exonCodingRanges
    # holds them trimmed to the CDS, exonWCodeStarts/Ends hold them untrimmed
    exonWCodeStarts = []
    exonWCodeEnds = []
    exonCodingRanges = []
    for j in range(len(exonStarts)):
        if exonEnds[j] <= cdsStart or exonStarts[j] > cdsEnd:
            continue
        start = exonStarts[j]
        end = exonEnds[j]
        if start < cdsStart:
            start = long(cdsStart)
        if end > cdsEnd:
            end = long(cdsEnd)
        exonWCodeStarts.append(exonStarts[j])
        exonWCodeEnds.append(exonEnds[j])
        exonCodingRanges.append((start, end))

    # coding exons already walked, as [start, end] trimmed to the CDS
    exons = []
    running_intron_count = 0  # 1-based once incremented

    # every exon left in exonWCodeStarts/Ends overlaps the coding region
    # (that is how the lists were built), so no further filtering is needed
    for j in range(len(exonWCodeStarts)):
        # trim the start and end to the coding region
        if exonWCodeStarts[j] < cdsStart:
            exonWCodeStarts[j] = cdsStart
        if exonWCodeEnds[j] > cdsEnd:
            exonWCodeEnds[j] = cdsEnd
        exon_start = exonWCodeStarts[j]
        exon_end = exonWCodeEnds[j]

        # check the intron between the previous coding exon and this one
        if len(exons) > 0:
            if strand == "+":
                intron_start = exons[-1][1]
                intron_end = exon_start
            else:
                intron_start = exon_end
                intron_end = exons[-1][0]

            running_intron_count += 1

            if record.start > intron_start and record.end <= intron_end:
                return ("intron", running_intron_count)

        # skip variants spanning start or end of coding region
        # (we haven't worked out how to report these yet)
        if (
            (record.start <= cdsStart and record.end > cdsStart)
            or (record.start <= cdsEnd and record.end > cdsEnd)
            or (record.start - 1 == record.end == cdsStart)
            or (record.start == record.end + 1 == cdsEnd)
        ):
            return ("span_coding_edge",)

        # skip variants spanning start or end of exon boundaries
        # (we haven't worked out how to report these yet)
        if (record.start <= exon_start and record.end > exon_start) or (
            record.start <= exon_end and record.end > exon_end
        ):
            return ("span_exon_boundary",)

        # anything not wholly inside this exon is left for a later iteration
        in_exon = (exon_start < record.start <= exon_end) and (exon_start < record.end <= exon_end)
        if not in_exon:
            exons.append([exon_start, exon_end])
            continue

        # get the alleles; "-" denotes an empty (deleted) allele
        alleles = ["" if a == "-" else a for a in record.attributes["alleles"].strip('"').split("/")]
        ref_allele = record.attributes["ref_allele"]
        if ref_allele == "-":
            ref_allele = ""
        if len(ref_allele) != record.end + 1 - record.start:
            sys.exit(
                "Reference allele length doesn't match GFF positions! ref_allele: \""
                + record.attributes["ref_allele"]
                + '", start: '
                + str(record.start)
                + " end: "
                + str(record.end)
            )
        # only the non-reference alleles are described
        try:
            alleles.remove(ref_allele)
        except ValueError:
            pass

        # Lay the coding exons out in chromosome order around the exon that
        # holds the variant. exonCodingRanges is in transcript order, so for
        # "-" genes it is reversed and the exons that follow j in the
        # transcript end up on the left.
        this_start, this_end = exonCodingRanges[j]
        if strand == "-":
            all_ranges = list(reversed(exonCodingRanges))
            left_ranges = list(reversed(exonCodingRanges[j + 1:]))
            right_ranges = list(reversed(exonCodingRanges[:j]))
        else:
            all_ranges = exonCodingRanges
            left_ranges = exonCodingRanges[:j]
            right_ranges = exonCodingRanges[j + 1:]

        # reference coding sequence, in the direction of translation
        seq_ref = _fetch_ranges(twobit_file, chrom, all_ranges)
        if strand == "-":
            seq_ref = reverse_complement(seq_ref)

        # the flanks do not depend on the allele, so look them up once
        left_seq = _fetch_ranges(twobit_file, chrom, left_ranges) \
            + twobit_file[chrom][this_start:(record.start - 1)]
        right_seq = twobit_file[chrom][record.end:this_end] \
            + _fetch_ranges(twobit_file, chrom, right_ranges)

        # describe each variant coding sequence against the reference
        amino_acid_changes = []
        for allele in alleles:
            seq_var = left_seq + allele + right_seq
            if strand == "-":
                seq_var = reverse_complement(seq_var)
            try:
                variant_descriptions = desc_variants(seq_ref, seq_var)
            except AssertionError:
                # desc_variants rejected this allele; leave it out
                continue
            if variant_descriptions:
                amino_acid_changes.append(variant_descriptions)

        if amino_acid_changes:
            return ("nonsynonymous coding", 1, " ".join(amino_acid_changes))
        return ("synonymous coding",)
def _fetch_ranges(twobit_file, chrom, ranges):
    """
    Return the concatenation of twobit_file[chrom][start:end] for every
    (start, end) pair in ranges, in the order given.
    """
    return "".join([twobit_file[chrom][e[0]:e[1]] for e in ranges])


def infer_function(twobit_file, record, geneName, strand, cdsStart, cdsEnd,
                   exonStarts, exonEnds):
    """
    Infer "function" (as dbSNP calls it) given a reference TwoBitFile object,
    a GFFRecord object, and info about the gene: name, strand, coding sequence
    start, coding sequence end (both 0-based, half-open), exon starts (comma-
    separated string), and exon ends (comma-separated string).

    Returns a tuple whose first element names the "function":

      ("5'-UTR",) or ("3'-UTR",)      record lies outside the coding region
      ("intron", n)                   n counts introns between coding exons,
                                      1-based, in transcript order
      ("span_coding_edge",)           record spans, or is a zero-length
                                      record at, cdsStart or cdsEnd
      ("span_exon_boundary",)         record spans an edge of a coding exon
      ("nonsynonymous coding", 1, s)  s is the space-joined desc_variants()
                                      results for the non-reference alleles
      ("synonymous coding",)          no allele produced a description

    If the record matches none of the tests, the loop finishes without an
    explicit return value. Calls sys.exit() when the length of ref_allele
    disagrees with the record's start and end.
    """
    # normalize the chromosome name to its "chr"-prefixed form
    if record.seqname.startswith("chr"):
        chrom = record.seqname
    else:
        chrom = "chr" + record.seqname

    # records entirely outside the coding region are reported as UTR; the
    # 5'/3' decision is keyed on the strand of the record
    if (record.strand == "+" and record.end <= cdsStart) or \
      (record.strand == "-" and record.start > cdsEnd):
        return ("5'-UTR", )

    if (record.strand == "+" and record.start > cdsEnd) or \
      (record.strand == "-" and record.end <= cdsStart):
        return ("3'-UTR", )

    # exonStarts and exonEnds may arrive as objects offering tostring()
    # rather than as strings; a plain string has no tostring(), so the
    # AttributeError simply means there is nothing to convert
    try:
        exonStarts = exonStarts.tostring()
        exonEnds = exonEnds.tostring()
    except AttributeError:
        pass

    # split the comma-separated strings (a trailing comma is tolerated)
    exonStarts = [long(e) for e in exonStarts.strip(",").split(",")]
    exonEnds = [long(e) for e in exonEnds.strip(",").split(",")]

    # walk the exons in transcript order
    if strand == "-":
        exonStarts.reverse()
        exonEnds.reverse()

    # keep only the exons that overlap the coding region; exonCodingRanges
    # holds them trimmed to the CDS, exonWCodeStarts/Ends hold them untrimmed
    exonWCodeStarts = []
    exonWCodeEnds = []
    exonCodingRanges = []
    for j in range(len(exonStarts)):
        if exonEnds[j] <= cdsStart or exonStarts[j] > cdsEnd:
            continue
        start = exonStarts[j]
        end = exonEnds[j]
        if start < cdsStart:
            start = long(cdsStart)
        if end > cdsEnd:
            end = long(cdsEnd)
        exonWCodeStarts.append(exonStarts[j])
        exonWCodeEnds.append(exonEnds[j])
        exonCodingRanges.append((start, end))

    # coding exons already walked, as [start, end] trimmed to the CDS
    exons = []
    running_intron_count = 0  # 1-based once incremented

    # every exon left in exonWCodeStarts/Ends overlaps the coding region
    # (that is how the lists were built), so no further filtering is needed
    for j in range(len(exonWCodeStarts)):
        # trim the start and end to the coding region
        if exonWCodeStarts[j] < cdsStart:
            exonWCodeStarts[j] = cdsStart
        if exonWCodeEnds[j] > cdsEnd:
            exonWCodeEnds[j] = cdsEnd
        exon_start = exonWCodeStarts[j]
        exon_end = exonWCodeEnds[j]

        # check the intron between the previous coding exon and this one
        if len(exons) > 0:
            if strand == "+":
                intron_start = exons[-1][1]
                intron_end = exon_start
            else:
                intron_start = exon_end
                intron_end = exons[-1][0]

            running_intron_count += 1

            if record.start > intron_start and record.end <= intron_end:
                return ("intron", running_intron_count)

        # skip variants spanning start or end of coding region
        # (we haven't worked out how to report these yet)
        if (record.start <= cdsStart and record.end > cdsStart) or \
            (record.start <= cdsEnd and record.end > cdsEnd) or \
            (record.start - 1 == record.end == cdsStart) or \
            (record.start == record.end + 1 == cdsEnd):
            return ("span_coding_edge", )

        # skip variants spanning start or end of exon boundaries
        # (we haven't worked out how to report these yet)
        if (record.start <= exon_start and record.end > exon_start) or \
            (record.start <= exon_end and record.end > exon_end):
            return ("span_exon_boundary", )

        # anything not wholly inside this exon is left for a later iteration
        in_exon = (exon_start < record.start <= exon_end) \
            and (exon_start < record.end <= exon_end)
        if not in_exon:
            exons.append([exon_start, exon_end])
            continue

        # get the alleles; "-" denotes an empty (deleted) allele
        alleles = [
            "" if a == "-" else a
            for a in record.attributes["alleles"].strip("\"").split("/")
        ]
        ref_allele = record.attributes["ref_allele"]
        if ref_allele == "-":
            ref_allele = ""
        if len(ref_allele) != record.end + 1 - record.start:
            sys.exit("Reference allele length doesn't match GFF positions! ref_allele: \""  \
                + record.attributes["ref_allele"] + "\", start: " + str(record.start) + " end: " \
                + str(record.end))
        # only the non-reference alleles are described
        try:
            alleles.remove(ref_allele)
        except ValueError:
            pass

        # Lay the coding exons out in chromosome order around the exon that
        # holds the variant. exonCodingRanges is in transcript order, so for
        # "-" genes it is reversed and the exons that follow j in the
        # transcript end up on the left.
        this_start, this_end = exonCodingRanges[j]
        if strand == "-":
            all_ranges = list(reversed(exonCodingRanges))
            left_ranges = list(reversed(exonCodingRanges[j + 1:]))
            right_ranges = list(reversed(exonCodingRanges[:j]))
        else:
            all_ranges = exonCodingRanges
            left_ranges = exonCodingRanges[:j]
            right_ranges = exonCodingRanges[j + 1:]

        # reference coding sequence, in the direction of translation
        seq_ref = _fetch_ranges(twobit_file, chrom, all_ranges)
        if strand == "-":
            seq_ref = reverse_complement(seq_ref)

        # the flanks do not depend on the allele, so look them up once
        left_seq = _fetch_ranges(twobit_file, chrom, left_ranges) \
            + twobit_file[chrom][this_start:(record.start - 1)]
        right_seq = twobit_file[chrom][record.end:this_end] \
            + _fetch_ranges(twobit_file, chrom, right_ranges)

        # describe each variant coding sequence against the reference
        amino_acid_changes = []
        for allele in alleles:
            seq_var = left_seq + allele + right_seq
            if strand == "-":
                seq_var = reverse_complement(seq_var)
            try:
                variant_descriptions = desc_variants(seq_ref, seq_var)
            except AssertionError:
                # desc_variants rejected this allele; leave it out
                continue
            if variant_descriptions:
                amino_acid_changes.append(variant_descriptions)

        if amino_acid_changes:
            return ("nonsynonymous coding", 1,
                    " ".join(amino_acid_changes))
        return ("synonymous coding", )
예제 #10
0
def main():
	"""Print the reference sequence of every GFF record as a FASTA record.

	Options and positional arguments are parsed by ``doc_optparse`` from the
	module docstring.  Two positional arguments are required; they are first
	tried as (2bit file, GFF file) and, if opening them that way raises,
	as (GFF file, 2bit file).

	Options read here:
	  flank  -- number of flanking bases to fetch on each side of a record
	            (defaults to 0).  When non-zero, the left flank, the record's
	            own sequence and the right flank are joined with blank lines.
	  strand -- when set, the output for records whose strand is "-" is
	            passed through ``reverse_complement``.
	  unique -- when set, a run of consecutive equal records is printed once,
	            with the run length stored in its ``repetition_count``
	            attribute.

	Each FASTA header is the record's string form with tabs replaced by "|".
	Nothing is printed when the GFF input yields no records.
	"""
	# parse options
	option, args = doc_optparse.parse(__doc__)

	if len(args) < 2:
		doc_optparse.exit()

	flank = int(option.flank or 0)

	# try opening the files both ways, in case the arguments got confused
	try:
		gff_file = gff.input(args[1])
		twobit_file = twobit.input(args[0])
	except Exception:
		gff_file = gff.input(args[0])
		twobit_file = twobit.input(args[1])

	# state used only by the unique option: the record whose output is being
	# held back, its sequence, and how many times in a row it has been seen
	previous_record = None
	previous_ref_seq = None
	repetition_count = 1

	for record in gff_file:
		if option.unique:
			# output the previous record only once we're sure we've seen all
			# repetitions of it, i.e. when a different record shows up
			if record == previous_record:
				repetition_count += 1
				continue
			if previous_record is not None:
				previous_record.attributes["repetition_count"] = str(repetition_count)
				print(FastaRecord(str(previous_record).replace("\t", "|"), previous_ref_seq))
			repetition_count = 1
			previous_record = record

		# sequence names are looked up with a "chr" prefix in the 2bit file
		if record.seqname.startswith("chr"):
			chrom = record.seqname
		else:
			chrom = "chr" + record.seqname

		# record.start is shifted down by one to form the slice start
		# (presumably GFF coordinates are 1-based inclusive -- TODO confirm)
		ref_seq = twobit_file[chrom][(record.start - 1):record.end]

		if flank != 0:
			# calculate the flanks (these variables are 0-based); the left
			# flank is clamped so it never starts before the sequence does
			left_flank_start = max(record.start - flank - 1, 0)
			left_flank_end = record.start - 1

			right_flank_start = record.end
			right_flank_end = record.end + flank

			# now find them
			left_flank_seq = twobit_file[chrom][left_flank_start:left_flank_end]
			right_flank_seq = twobit_file[chrom][right_flank_start:right_flank_end]
			ref_seq = left_flank_seq + "\n\n" + ref_seq + "\n\n" + right_flank_seq

		if option.strand and record.strand == "-":
			ref_seq = reverse_complement(ref_seq)

		# with the unique option the current record is held back until the
		# next distinct record (or the end of input) is reached
		if option.unique:
			previous_ref_seq = ref_seq
		else:
			print(FastaRecord(str(record).replace("\t", "|"), ref_seq))

	# one last record is still pending if the unique option was used; there
	# is none when the input was empty, so guard against that
	if option.unique and previous_record is not None:
		previous_record.attributes["repetition_count"] = str(repetition_count)
		print(FastaRecord(str(previous_record).replace("\t", "|"), previous_ref_seq))