if not data: raise ValueError # move on if there's no info on where on the genome it falls if data[1] == "--": continue # figure out the chromosome name chr = "chr%s" % (data[1]) # figure out what those A's and B's mean # we must replace the "A" designation first, then "B", because # "A" is also an actual unambiguous nucleotide, while "B" is not actual_call = call.replace("A", data[4]).replace("B", data[5]) if data[1] != "--" and data[3] == "-": actual_call = reverse_complement(actual_call) # prepare output out_line = chr out_line += "\taffx\tsnp\t" out_line += "%s\t%s" % (data[2], data[2] ) # info from the affx databases is 1-based out_line += "\t.\t+\t.\t" if actual_call[0] != actual_call[1]: out_line += "alleles " + "/".join(sorted(list(actual_call))) else: out_line += "alleles " + actual_call[0] out_line += ";db_xref affx:%s dbsnp:%s" % (probe_set_id, data[0]) out_line += ";ras_1 " + ras_1
if not data: raise ValueError # move on if there's no info on where on the genome it falls if data[1] == "--": continue # figure out the chromosome name chr = "chr%s" % (data[1]) # figure out what those A's and B's mean # we must replace the "A" designation first, then "B", because # "A" is also an actual unambiguous nucleotide, while "B" is not actual_call = call.replace("A", data[4]).replace("B", data[5]) if data[1] != "--" and data[3] == "-": actual_call = reverse_complement(actual_call) # prepare output out_line = chr out_line += "\taffx\tsnp\t" out_line += "%s\t%s" % (data[2], data[2]) # info from the affx databases is 1-based out_line += "\t.\t+\t.\t" if actual_call[0] != actual_call[1]: out_line += "alleles " + "/".join(sorted(list(actual_call))) else: out_line += "alleles " + actual_call[0] out_line += ";db_xref affx:%s dbsnp:%s" % (probe_set_id, data[0]) out_line += ";ras_1 " + ras_1 out_line += ";ras_2 " + ras_2
def infer_function(twobit_file, record, geneName, strand, cdsStart, cdsEnd, exonStarts, exonEnds):
    """
    Infer "function" (as dbSNP calls it) given a reference TwoBitFile object,
    a GFFRecord object, and info about the gene: name, strand, coding sequence
    start, coding sequence end (both 0-based, half-open), exon starts (comma-
    separated string), and exon ends (comma-separated string).

    Only single-base records (record.start == record.end, both 1-based) are
    handled by the intron and exon tests.

    Arguments:
    twobit_file -- reference sequence, indexed as twobit_file[chrom][start:end]
    record -- the variant; its strand, start, end and seqname are read, as are
      its "alleles" and "ref_allele" attributes
    geneName -- not used by this function
    strand -- gene strand; "-" walks the exons in reverse, anything else is
      treated as "+"
    cdsStart, cdsEnd -- coding region bounds
    exonStarts, exonEnds -- comma-separated strings, or objects providing
      tostring()

    Returns a tuple consisting of the "function" (coding, 5'-UTR, etc.),
    followed by the exon or intron number (1-based, if applicable), and amino
    acid residue (1-based numeric type, if applicable) or change (1-based
    string, if applicable). Falls through (returning None) if the record
    matches no coding exon or intron.
    """
    # we're done if it's not intronic or exonic
    # NOTE(review): the UTR side is picked from record.strand rather than from
    # the gene's `strand` argument -- confirm that this is intended
    if ((record.strand == "+" and record.start <= cdsStart) or
            (record.strand == "-" and record.end > cdsEnd)):
        return ("5'-UTR",)
    if ((record.strand == "+" and record.end > cdsEnd) or
            (record.strand == "-" and record.start <= cdsStart)):
        return ("3'-UTR",)

    # make exonStarts and exonEnds into lists
    # first, we have to make sure they're strings...
    try:
        exonStarts = exonStarts.tostring()
        exonEnds = exonEnds.tostring()
    # if we already have a string, tostring() won't work
    except AttributeError:
        pass
    # now, we really make them lists (int() promotes to long by itself on
    # Python 2, so no explicit long() is needed)
    exonStarts = [int(e) for e in exonStarts.strip(",").split(",")]
    exonEnds = [int(e) for e in exonEnds.strip(",").split(",")]

    # keep the untrimmed exons, in chromosome order, for codon_intersect();
    # this has to be set aside before the lists are reversed or trimmed
    all_exons = list(zip(exonStarts, exonEnds))

    # walk the exons in the direction of transcription
    if strand == "-":
        exonStarts.reverse()
        exonEnds.reverse()

    # parse out exons
    exons = []
    running_intron_count = running_exon_count = running_cds_bases_count = 0  # 1-based
    for j in range(len(exonStarts)):
        # discard any entirely non-coding exons
        if not (exonEnds[j] > cdsStart and exonStarts[j] <= cdsEnd):
            continue

        # trim the start and end to the coding region
        if exonStarts[j] < cdsStart:
            exonStarts[j] = cdsStart
        if exonEnds[j] > cdsEnd:
            exonEnds[j] = cdsEnd

        running_exon_count += 1

        # look at the intron between the previous coding exon and this one
        if exons:
            if strand == "+":
                intron_start = exons[-1][1]
                intron_end = exonStarts[j]
            else:
                intron_start = exonEnds[j]
                intron_end = exons[-1][0]
            running_intron_count += 1

            # test if is in intron (remember, start and end are 1-based)
            # (this only works if record.start = record.end (i.e. SNPs);
            # otherwise, this will need to be adapted by taking strand
            # into account)
            if record.start > intron_start and record.end <= intron_end:
                return ("intron", running_intron_count)

        # look at exon (again, this only works if record.start = record.end
        # and assumes both are 1-based)
        if record.start > exonStarts[j] and record.end <= exonEnds[j]:
            # figure out number of coding bases up to and including the
            # variant, and from that the position within the codon
            if strand == "+":
                running_cds_bases_count += record.start - exonStarts[j]
                frame_offset = running_cds_bases_count % 3
                if frame_offset == 0:
                    frame_offset = 3
                # chr direction =>
                # translation direction =>
                # -------------
                # | 1 | 2 | 3 |
                # -------------
                #   ^ first base of codon
                #
                # note that this convention corresponds to frames 0, 2, 1
                # respectively in GTF notation
            else:
                running_cds_bases_count += exonEnds[j] + 1 - record.end
                frame_offset = -1 * (running_cds_bases_count % 3)
                if frame_offset == 0:
                    frame_offset = -3
                # chr direction =>
                # <= translation direction
                # -------------
                # |-3 |-2 |-1 |
                # -------------
                #           ^ first base of codon
                #
                # note that this convention corresponds to frames 1, 2, 0
                # respectively in GTF notation

            # ceiling of (bases / 3), done in integer arithmetic
            amino_acid_residue = (running_cds_bases_count + 2) // 3

            # figure out what we need, and prepare to look it up
            _, _, intervals = codon_intersect(record.start - 1, record.end,
                                              all_exons, frame_offset)

            # calculate the chromosome name we want to use
            if record.seqname.startswith("chr"):
                chrom = record.seqname
            else:
                chrom = "chr" + record.seqname

            # look it up; within each set of intervals, the same codons could
            # have different positions for alternative splicings, etc.
            ref_seq = "".join([twobit_file[chrom][k[0]:k[1]] for k in intervals])

            # index of the variant base within ref_seq (chromosome direction):
            # frame offsets 1, 2, 3 and -3, -2, -1 map to 0, 1, 2
            replacement_coord = (frame_offset + 3) % 4

            # figure out which allele is not the mutant
            alleles = record.attributes["alleles"].strip("\"").split("/")
            try:
                alleles.remove(record.attributes["ref_allele"])
            except ValueError:
                # the reference allele is not among the observed alleles
                pass

            mitochondrial = chrom.startswith("chrM")

            def residue_of(codon):
                # translate a codon given in chromosome direction
                if frame_offset < 0:
                    codon = reverse_complement(codon)
                if mitochondrial:
                    return translate(codon, "Vertebrate Mitochondrial")
                return translate(codon)

            # the reference residue is the same for every mutant allele
            ref_residue = residue_of(ref_seq) if alleles else None

            # now work through each mutant allele
            amino_acid_changes = []
            for mut_allele in alleles:
                mut_seq_list = list(ref_seq)
                mut_seq_list[replacement_coord] = mut_allele
                mut_residue = residue_of("".join(mut_seq_list))
                if ref_residue != mut_residue:
                    amino_acid_changes.append(
                        ref_residue + str(amino_acid_residue) + mut_residue)

            # return info
            if amino_acid_changes:
                return ("nonsynonymous coding", running_exon_count,
                        " ".join(amino_acid_changes))
            return ("synonymous coding", running_exon_count, amino_acid_residue)

        # otherwise, continue the bookkeeping
        running_cds_bases_count += exonEnds[j] - exonStarts[j]
        exons.append([exonStarts[j], exonEnds[j]])
# quit if we don't have an rs number if not rs: continue # we wouldn't know what to do with this, so pass it up for now if len(alleles) > 2: continue # create the genotype string from the given alleles #TODO: do something about the Y chromosome if len(alleles) == 1: genotype = alleles[0] alleles = [alleles[0], alleles[0]] else: genotype = ';'.join(sorted(alleles)) reverse_alleles = [reverse_complement(a) for a in alleles] # query the database cursor.execute(query, (rs, alleles[0] + '%', alleles[1] + '%', reverse_alleles[0] + '%', reverse_alleles[1] + '%')) data = cursor.fetchall() # move on if we don't have info if cursor.rowcount <= 0: continue gene_acid_base = None gene = None
print >> sys.stderr, "# not found:" print >> sys.stderr, line continue # this would be very strange if chr != datum[0] and chr != "None": print >> sys.stderr, "# not on expected chromosome %s:" % datum[0] print >> sys.stderr, line continue else: chr = datum[0] strand = datum[3] # filter out genotypes that don't match the reference allele, if asked to if option.reference: ref = twobit_file[chr][datum[1]:datum[2]] if strand == "-": ref = reverse_complement(ref) if genotype != (ref + ";" + ref): continue print "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (phenotype, chr, datum[1], datum[2], strand, genotype, pubmed, rs) # close database cursor and connection cursor.close() connection.close() if __name__ == "__main__": main()
if x.startswith("dbsnp:rs"): rs = x.replace("dbsnp:", "") break # quit if we don't have an rs number if not rs: continue # we wouldn't know what to do with this, so pass it up for now if len(alleles) > 2: continue # create the genotype string from the given alleles #TODO: do something about the Y chromosome if len(alleles) == 1: genotype = alleles[0] + ";" + alleles[0] reverse_genotype = reverse_complement(alleles[0]) + ";" + reverse_complement(alleles[0]) else: genotype = ';'.join(sorted(alleles)) reverse_genotype = ';'.join(sorted([reverse_complement(a) for a in alleles])) # query the database cursor.execute(query, (rs, genotype, reverse_genotype)) data = cursor.fetchall() # move on if we don't have info if cursor.rowcount <= 0: continue for d in data: phenotype = d[0] pubmed = d[1]
unavailable = False if unavailable: l["maf"] = "N/A" if "trait_allele" in l: l["taf"] = "N/A" else: # output minor allele frequency as a dictionary, with population abbrs as keys l["maf"] = dict(zip([d[1] for d in data], [float(min(d[3], d[5])) for d in data])) # output trait allele frequency as a dictionary; this one is a little trickier if "trait_allele" in l: l["taf"] = {"all_n": 0, "all_d": 0} for d in data: if d[0] == "+" and l["trait_allele"] == d[2] \ or d[0] == "-" and l["trait_allele"] == reverse_complement(d[2]): l["taf"][d[1]] = float(d[3]) l["taf"]["all_n"] += d[6] elif d[0] == "+" and l["trait_allele"] == d[4] \ or d[0] == "-" and l["trait_allele"] == reverse_complement(d[4]): l["taf"][d[1]] = float(d[5]) l["taf"]["all_n"] += d[7] l["taf"]["all_d"] += d[6]+d[7] print json.dumps(l) # close database cursor and connection cursor.close() connection.close() if __name__ == "__main__": main()
def infer_function(twobit_file, record, geneName, strand, cdsStart, cdsEnd, exonStarts, exonEnds):
    """
    Infer "function" (as dbSNP calls it) given a reference TwoBitFile object,
    a GFFRecord object, and info about the gene: name, strand, coding sequence
    start, coding sequence end (both 0-based, half-open), exon starts (comma-
    separated string), and exon ends (comma-separated string).

    Arguments:
    twobit_file -- reference sequence, indexed as twobit_file[chrom][start:end]
    record -- the variant; its strand, start, end and seqname are read, as are
      its "alleles" and "ref_allele" attributes ("-" denotes an empty allele)
    geneName -- not used by this function
    strand -- gene strand; "-" walks the exons in reverse, anything else is
      treated as "+"
    cdsStart, cdsEnd -- coding region bounds
    exonStarts, exonEnds -- comma-separated strings, or objects providing
      tostring()

    Returns one of the tuples ("5'-UTR",), ("3'-UTR",), ("intron", n) with n
    the 1-based intron number, ("span_coding_edge",), ("span_exon_boundary",),
    ("synonymous coding",) or ("nonsynonymous coding", 1, changes), where
    changes is the space-joined output of desc_variants() for each
    non-reference allele (the second element is always the literal 1).
    Falls through (returning None) if the record matches none of these.

    Exits the program via sys.exit() if the reference allele length disagrees
    with the record's start and end.
    """
    # Check chromosome name
    if record.seqname.startswith("chr"):
        chrom = record.seqname
    else:
        chrom = "chr" + record.seqname

    # Check if it falls entirely outside the gene region
    # NOTE(review): the UTR side is picked from record.strand rather than from
    # the gene's `strand` argument -- confirm that this is intended
    if ((record.strand == "+" and record.end <= cdsStart) or
            (record.strand == "-" and record.start > cdsEnd)):
        return ("5'-UTR",)
    if ((record.strand == "+" and record.start > cdsEnd) or
            (record.strand == "-" and record.end <= cdsStart)):
        return ("3'-UTR",)

    # make exonStarts and exonEnds into lists
    # first, we have to make sure they're strings...
    try:
        exonStarts = exonStarts.tostring()
        exonEnds = exonEnds.tostring()
    # if we already have a string, tostring() won't work
    except AttributeError:
        pass
    # now, we really make them lists (int() promotes to long by itself on
    # Python 2, so no explicit long() is needed)
    exonStarts = [int(e) for e in exonStarts.strip(",").split(",")]
    exonEnds = [int(e) for e in exonEnds.strip(",").split(",")]

    # walk the exons in the direction of transcription
    if strand == "-":
        exonStarts.reverse()
        exonEnds.reverse()

    # Get coordinates of coding sequence: drop exons that lie wholly outside
    # the coding region and trim the others to it
    coding_ranges = []
    for exon_start, exon_end in zip(exonStarts, exonEnds):
        if exon_end <= cdsStart or exon_start > cdsEnd:
            continue
        if exon_start < cdsStart:
            exon_start = int(cdsStart)
        if exon_end > cdsEnd:
            exon_end = int(cdsEnd)
        coding_ranges.append((exon_start, exon_end))

    def fetch(ranges):
        # concatenated reference bases for a series of (start, end) ranges
        return "".join([twobit_file[chrom][s:e] for s, e in ranges])

    # Splice site prediction used to live at the top of this loop; it was
    # disabled on 2010/12/03 (MPB) and the dead code has been dropped here.
    previous_exon = None
    running_intron_count = 0
    for j, (coding_start, coding_end) in enumerate(coding_ranges):
        # check if it's in the intron between the previous coding exon and
        # this one
        if previous_exon is not None:
            if strand == "+":
                intron_start = previous_exon[1]
                intron_end = coding_start
            else:
                intron_start = coding_end
                intron_end = previous_exon[0]
            running_intron_count += 1
            if record.start > intron_start and record.end <= intron_end:
                return ("intron", running_intron_count)

        # skip variants spanning start or end of coding region
        # (we haven't worked out how to report these yet); the last two
        # clauses catch zero-length records sitting exactly on an edge
        if ((record.start <= cdsStart and record.end > cdsStart) or
                (record.start <= cdsEnd and record.end > cdsEnd) or
                (record.start - 1 == record.end == cdsStart) or
                (record.start == record.end + 1 == cdsEnd)):
            return ("span_coding_edge",)

        # skip variants spanning start or end of exon boundaries
        # (we haven't worked out how to report these yet)
        if ((record.start <= coding_start and record.end > coding_start) or
                (record.start <= coding_end and record.end > coding_end)):
            return ("span_exon_boundary",)

        if (coding_start < record.start <= coding_end and
                coding_start < record.end <= coding_end):
            # get alleles, writing the "-" placeholder as an empty string
            alleles = ["" if a == "-" else a
                       for a in record.attributes["alleles"].strip('"').split("/")]
            ref_allele = record.attributes["ref_allele"]
            if ref_allele == "-":
                ref_allele = ""
            if len(ref_allele) != record.end + 1 - record.start:
                sys.exit(
                    "Reference allele length doesn't match GFF positions! ref_allele: \""
                    + record.attributes["ref_allele"]
                    + '", start: '
                    + str(record.start)
                    + " end: "
                    + str(record.end)
                )
            try:
                alleles.remove(ref_allele)
            except ValueError:
                # the reference allele is not among the observed alleles
                pass

            # Reference bases in chromosome order: the coding exons on either
            # side of this one, and this exon split around the variant.
            # coding_ranges is in transcription order, so on the minus strand
            # the later entries are the ones to the left on the chromosome.
            if strand == "-":
                left_exons = fetch(reversed(coding_ranges[j + 1:]))
                right_exons = fetch(reversed(coding_ranges[:j]))
            else:
                left_exons = fetch(coding_ranges[:j])
                right_exons = fetch(coding_ranges[j + 1:])
            this_exon = twobit_file[chrom][coding_start:coding_end]
            before_variant = twobit_file[chrom][coding_start:(record.start - 1)]
            after_variant = twobit_file[chrom][record.end:coding_end]

            # Generate reference and variant coding region DNA sequences
            seq_ref = left_exons + this_exon + right_exons
            if strand == "-":
                seq_ref = reverse_complement(seq_ref)

            # Get variants
            amino_acid_changes = []
            for allele in alleles:
                seq = (left_exons + before_variant + allele
                       + after_variant + right_exons)
                if strand == "-":
                    seq = reverse_complement(seq)
                try:
                    variant_descriptions = desc_variants(seq_ref, seq)
                except AssertionError:
                    # desc_variants() rejected this allele; it is skipped, as
                    # before, rather than aborting the whole record
                    continue
                if variant_descriptions:
                    amino_acid_changes.append(variant_descriptions)

            if amino_acid_changes:
                return ("nonsynonymous coding", 1, " ".join(amino_acid_changes))
            return ("synonymous coding",)

        previous_exon = (coding_start, coding_end)
def infer_function(twobit_file, record, geneName, strand, cdsStart, cdsEnd, exonStarts, exonEnds):
    """
    Infer "function" (as dbSNP calls it) given a reference TwoBitFile object,
    a GFFRecord object, and info about the gene: name, strand, coding sequence
    start, coding sequence end (both 0-based, half-open), exon starts (comma-
    separated string), and exon ends (comma-separated string).

    Arguments:
    twobit_file -- reference sequence, indexed as twobit_file[chrom][start:end]
    record -- the variant; its strand, start, end and seqname are read, as are
      its "alleles" and "ref_allele" attributes ("-" denotes an empty allele)
    geneName -- not used by this function
    strand -- gene strand; "-" walks the exons in reverse, anything else is
      treated as "+"
    cdsStart, cdsEnd -- coding region bounds
    exonStarts, exonEnds -- comma-separated strings, or objects providing
      tostring()

    Returns one of the tuples ("5'-UTR",), ("3'-UTR",), ("intron", n) with n
    the 1-based intron number, ("span_coding_edge",), ("span_exon_boundary",),
    ("synonymous coding",) or ("nonsynonymous coding", 1, changes), where
    changes is the space-joined output of desc_variants() for each
    non-reference allele (the second element is always the literal 1).
    Falls through (returning None) if the record matches none of these.

    Exits the program via sys.exit() if the reference allele length disagrees
    with the record's start and end.
    """
    # Check chromosome name
    if record.seqname.startswith("chr"):
        chrom = record.seqname
    else:
        chrom = "chr" + record.seqname

    # Check if it falls entirely outside the gene region
    # NOTE(review): the UTR side is picked from record.strand rather than from
    # the gene's `strand` argument -- confirm that this is intended
    if (record.strand == "+" and record.end <= cdsStart) or \
       (record.strand == "-" and record.start > cdsEnd):
        return ("5'-UTR", )
    if (record.strand == "+" and record.start > cdsEnd) or \
       (record.strand == "-" and record.end <= cdsStart):
        return ("3'-UTR", )

    # make exonStarts and exonEnds into lists
    # first, we have to make sure they're strings...
    try:
        exonStarts = exonStarts.tostring()
        exonEnds = exonEnds.tostring()
    # if we already have a string, tostring() won't work
    except AttributeError:
        pass
    # now, we really make them lists (int() promotes to long by itself on
    # Python 2, so no explicit long() is needed)
    exonStarts = [int(e) for e in exonStarts.strip(",").split(",")]
    exonEnds = [int(e) for e in exonEnds.strip(",").split(",")]

    # reverse for strand, so exons are visited in transcription order
    if strand == "-":
        exonStarts.reverse()
        exonEnds.reverse()

    # Get coordinates of coding sequence: exons wholly outside the coding
    # region are dropped, the rest are clipped to it
    exonCodingRanges = []
    for first, last in zip(exonStarts, exonEnds):
        if first > cdsEnd or last <= cdsStart:
            continue
        clipped_first = first if first >= cdsStart else int(cdsStart)
        clipped_last = last if last <= cdsEnd else int(cdsEnd)
        exonCodingRanges.append((clipped_first, clipped_last))

    def read_ranges(ranges):
        # reference bases for each (start, end) range, joined in the given order
        return "".join([twobit_file[chrom][r[0]:r[1]] for r in ranges])

    # Splice prediction was commented out here on 2010/12/03 (MPB); that dead
    # block has been removed rather than carried along as a string literal.
    last_exon = None
    running_intron_count = 0
    for j, (exon_from, exon_to) in enumerate(exonCodingRanges):
        # check if it's in the intron preceding this exon
        if last_exon is not None:
            if strand == "+":
                intron_start, intron_end = last_exon[1], exon_from
            else:
                intron_start, intron_end = exon_to, last_exon[0]
            running_intron_count += 1

            # test if is in within intron
            if record.start > intron_start and record.end <= intron_end:
                return ("intron", running_intron_count)

        # skip variants spanning start or end of coding region
        # (we haven't worked out how to report these yet); the last two
        # clauses catch zero-length records sitting exactly on an edge
        if (record.start <= cdsStart and record.end > cdsStart) or \
           (record.start <= cdsEnd and record.end > cdsEnd) or \
           (record.start - 1 == record.end == cdsStart) or \
           (record.start == record.end + 1 == cdsEnd):
            return ("span_coding_edge", )

        # skip variants spanning start or end of exon boundaries
        # (we haven't worked out how to report these yet)
        if (record.start <= exon_from and record.end > exon_from) or \
           (record.start <= exon_to and record.end > exon_to):
            return ("span_exon_boundary", )

        starts_inside = record.start > exon_from and record.start <= exon_to
        ends_inside = record.end > exon_from and record.end <= exon_to
        if starts_inside and ends_inside:
            # get alleles; "-" stands for an empty allele
            alleles = record.attributes["alleles"].strip("\"").split("/")
            alleles = ["" if a == "-" else a for a in alleles]
            ref_allele = record.attributes["ref_allele"]
            if ref_allele == "-":
                ref_allele = ""
            if len(ref_allele) != record.end + 1 - record.start:
                sys.exit("Reference allele length doesn't match GFF positions! ref_allele: \""
                         + record.attributes["ref_allele"] + "\", start: " + str(record.start) + " end: "
                         + str(record.end))
            try:
                alleles.remove(ref_allele)
            except ValueError:
                # the reference allele is not among the observed alleles
                pass

            # Read the reference once, in chromosome order. exonCodingRanges
            # is in transcription order, so on the minus strand the entries
            # after j are the ones lying to the left on the chromosome.
            if strand == "-":
                seq_left = read_ranges(reversed(exonCodingRanges[j + 1:]))
                seq_right = read_ranges(reversed(exonCodingRanges[:j]))
            else:
                seq_left = read_ranges(exonCodingRanges[:j])
                seq_right = read_ranges(exonCodingRanges[j + 1:])
            exon_seq = twobit_file[chrom][exon_from:exon_to]
            exon_head = twobit_file[chrom][exon_from:(record.start - 1)]
            exon_tail = twobit_file[chrom][record.end:exon_to]

            # Generate reference and variant coding region DNA sequences
            seq_ref = seq_left + exon_seq + seq_right
            if strand == "-":
                seq_ref = reverse_complement(seq_ref)

            # Get variants
            amino_acid_changes = []
            for allele in alleles:
                seq = seq_left + exon_head + allele + exon_tail + seq_right
                if strand == "-":
                    seq = reverse_complement(seq)
                try:
                    variant_descriptions = desc_variants(seq_ref, seq)
                except AssertionError:
                    # desc_variants() rejected this allele; it is skipped, as
                    # before, rather than aborting the whole record
                    continue
                if variant_descriptions:
                    amino_acid_changes.append(variant_descriptions)

            if amino_acid_changes:
                return ("nonsynonymous coding", 1, " ".join(amino_acid_changes))
            else:
                return ("synonymous coding", )

        last_exon = (exon_from, exon_to)
def main():
    """
    Print one FASTA record per GFF record, holding the reference sequence
    (from a 2bit file) that the record covers.

    Two positional arguments are expected, the 2bit file and the GFF file;
    if opening them in that order fails, the opposite order is tried.
    Options read: flank (number of flanking bases to add on each side,
    separated from the record's own sequence by blank lines), strand
    (reverse-complement the output for "-" records) and unique (collapse
    consecutive equal records into one, annotated with a
    "repetition_count" attribute).
    """
    # parse options
    option, args = doc_optparse.parse(__doc__)

    if len(args) < 2:
        doc_optparse.exit()  # error

    flank = int(option.flank or 0)

    # try opening the file both ways, in case the arguments got confused
    try:
        gff_file = gff.input(args[1])
        twobit_file = twobit.input(args[0])
    except Exception:
        gff_file = gff.input(args[0])
        twobit_file = twobit.input(args[1])

    def emit(rec, seq):
        # the FASTA header is the GFF line with tabs turned into pipes;
        # print(x) with one argument behaves the same on Python 2 and 3
        print(FastaRecord(str(rec).replace("\t", "|"), seq))

    # state used to keep track of uniqueness (only consulted with --unique)
    previous_record = None
    previous_ref_seq = None
    repetition_count = 1

    for record in gff_file:
        # if we're using the unique option, output the previous record only when
        # we're sure we've seen all repetitions of it
        if option.unique and record == previous_record:
            repetition_count += 1
            continue
        elif option.unique:
            if previous_record:
                previous_record.attributes["repetition_count"] = str(repetition_count)
                emit(previous_record, previous_ref_seq)
            repetition_count = 1
            previous_record = record

        if record.seqname.startswith("chr"):
            chrom = record.seqname
        else:
            chrom = "chr" + record.seqname

        # record.start is 1-based, the 2bit slice is 0-based half-open
        ref_seq = twobit_file[chrom][(record.start - 1):record.end]

        if flank != 0:
            # calculate the flanks (these variables are 0-based), clamping
            # the left flank at the start of the sequence
            left_flank_start = max(record.start - flank - 1, 0)
            left_flank_end = record.start - 1
            right_flank_start = record.end
            right_flank_end = record.end + flank

            # now find them
            left_flank_seq = twobit_file[chrom][left_flank_start:left_flank_end]
            right_flank_seq = twobit_file[chrom][right_flank_start:right_flank_end]
            ref_seq = left_flank_seq + "\n\n" + ref_seq + "\n\n" + right_flank_seq

        if option.strand and record.strand == "-":
            ref_seq = reverse_complement(ref_seq)

        # we don't output the current record if we're using the unique option
        if option.unique:
            previous_ref_seq = ref_seq
        else:
            emit(record, ref_seq)

    # we'll have one last record yet to output if we used the unique option;
    # there is none when the input held no records at all
    if option.unique and previous_record is not None:
        previous_record.attributes["repetition_count"] = str(repetition_count)
        emit(previous_record, previous_ref_seq)