def _buildAllele(allele_id,
                 transcript,
                 exons,
                 introns,
                 offsets,
                 virtual_coordinates=False,
                 reference_exons=None):
    """Assemble the coding sequence and peptide of one allele of a transcript.

    The exons of *transcript* are concatenated in order. Each intron
    longer than ``frameshiftsize`` is checked for its splice signal;
    shorter "introns" are treated as frameshifts and are merged into
    the preceding exon if their effective size (intron plus indels in
    the adjacent exon ends) is a multiple of three and they contain no
    stop codon. The resulting cds is translated and checked for a
    premature stop codon.

    NOTE(review): ``frameshiftsize``, ``is_seleno`` and
    ``reference_coordinates`` are read but are neither parameters nor
    defined in this block; they must be supplied by an enclosing or
    module scope that is not visible here.

    Arguments
    ---------
    allele_id :
        Identifier of the allele; only used in debugging output.
    transcript :
        Sequence of exon records providing ``start``, ``end``,
        ``transcript_id`` and ``frame``.
    exons :
        Mapping of ``(start, end)`` to a sequence of per-position
        strings. An empty string is treated as a deleted position and a
        string longer than one character as an insertion.
    introns :
        Mapping of ``(previous_exon_end, exon_start)`` to a sequence of
        per-position strings in the same encoding as *exons*.
    offsets :
        Sequence of ``(position, offset)`` pairs; scanned from the
        start, so presumably sorted by position.
    virtual_coordinates :
        Not used in this function.
    reference_exons :
        Not used in this function.

    Returns
    -------
    tuple
        ``(allele, map_cds2reference)`` where ``allele`` is built with
        ``Allele._make`` and ``map_cds2reference`` is the alignment
        between cds positions and reference positions.

    Raises
    ------
    AssertionError
        If the running genomic position disagrees with *offsets* at an
        exon boundary or the cds length bookkeeping is inconsistent.
    """

    def _getOffset(pos, offsets):
        '''return the offset of the last entry at or before pos (0 if none).'''
        x = 0
        while x < len(offsets) and offsets[x][0] <= pos:
            x += 1
        x -= 1
        if x >= 0:
            return offsets[x][1]
        return 0

    def _sumIndels(ss):
        '''sum indels within ss'''
        return sum(len(s) - 1 for s in ss)

    def _getEndOffsets(ss):
        '''get the offset at exons due to deletions at
        start/end of exon.'''
        l = len(ss)
        x = 0
        while x < l and ss[x] == "":
            x += 1
        start_offset = x

        x = l - 1
        while x >= 0 and ss[x] == "":
            x -= 1
        if x >= 0:
            return start_offset, (l - 1) - x
        return start_offset, 0

    def _addCds2Reference(map_cds2reference, cds_start, cds_seq,
                          reference_start):
        '''add cds to reference'''
        c, r = cds_start, reference_start
        for x in cds_seq:
            l = len(x)
            # deleted positions advance the reference only
            if l != 0:
                map_cds2reference.addPair(c, r)
                c += l
            r += 1

    def _printExonDebug(exon, exon_key, exon_seq, genome_pos,
                        start_offset, end_offset):
        '''print indel positions and coordinates of an exon.'''
        print("%i: exon_indels (%i-%i):" % (allele_id, exon.start, exon.end))
        for x, c in enumerate(exons[exon_key]):
            if len(c) != 1:
                print(x + exon.start, ":%s:" % c)
        print()
        print(exons[exon_key])
        # the original called an undefined ``getOffset`` here
        print("genome_pos=", genome_pos,
              ",exon=%i-%i" % (genome_pos, genome_pos + len(exon_seq)),
              ", len(exon_seq)=", len(exon_seq),
              ", len(exon)=", exon.end - exon.start,
              ", offsets=%i,%i," % (start_offset, end_offset),
              ", offset at start=", _getOffset(exon.start, offsets),
              ", offset at end=", _getOffset(exon.end, offsets))

    # counts
    is_splice_truncated = False
    is_nmd_knockout = False
    is_stop_truncated = False
    nuncorrected_frameshifts = 0
    ncorrected_frameshifts = 0
    nframeshifts = 0
    nsplice_noncanonical = 0
    reference_first_stop_start = -1
    reference_first_stop_end = -1

    # map between the new cds sequence and the reference
    # sequence
    map_cds2reference = alignlib_lite.py_makeAlignmentBlocks()

    ###################################################
    # process first exon
    exon = transcript[0]
    transcript_id = exon.transcript_id

    # collect offset for exon.start
    genome_start = exon.start
    genome_start += _getOffset(genome_start, offsets)
    lcds, cds = 0, []
    cds_starts = [0]

    # still need to deal with deletions of first base:
    exon_starts = [genome_start]
    exon_key = (exon.start, exon.end)
    exon_sequence = exons[exon_key]
    exon_seq = "".join(exon_sequence)

    cds.append(exon_seq)
    _addCds2Reference(map_cds2reference, lcds, exon_sequence, exon.start)
    lcds = len(exon_seq)

    if len(exon_seq) != exon.end - exon.start:
        nframeshifts += 1

    # add first exon to genome position
    genome_pos = genome_start + len(exon_seq)
    last_end = exon.end

    # correct for deletions at start/end of exon
    start_offset, end_offset = _getEndOffsets(exon_sequence)

    # length of original transcript
    loriginal = sum([x.end - x.start for x in transcript])

    if E.global_options.loglevel >= 8:
        _printExonDebug(exon, exon_key, exon_seq, genome_pos,
                        start_offset, end_offset)

    for exon in transcript[1:]:

        last_exon_sequence = exon_sequence

        # get the next intron/exon parameters
        exon_key = (exon.start, exon.end)
        exon_sequence = exons[exon_key]
        start_offset, end_offset = _getEndOffsets(exon_sequence)
        intron_key = (last_end, exon.start)

        if last_end == exon.start:
            # catch empty introns
            intron_sequence = []
            intron_key = None
        else:
            intron_sequence = introns[intron_key]

        intron_seq = "".join(intron_sequence)

        ###################################################
        ###################################################
        ###################################################
        # add preceding intron
        new_exon = True

        if len(intron_seq) > frameshiftsize:

            intron_name, intron_seq5, intron_seq3 = Genomics.GetIntronType(
                intron_seq)
            if intron_name == "unknown":
                if intron_seq[:2].islower() and intron_seq[-2:].islower():
                    E.debug(
                        "%s: transcript has unknown splice signal - kept because not a variant: %s: %s:%s"
                        % (transcript_id, intron_name, intron_seq5,
                           intron_seq3))
                    nsplice_noncanonical += 1
                else:
                    is_splice_truncated = True
                    E.debug(
                        "%s: transcript has splice truncated allele: %s: %s:%s"
                        % (transcript_id, intron_name, intron_seq5,
                           intron_seq3))
                    break
            # start a new exon
            cds_starts.append(lcds)

        else:
            # treat as frameshifting intron
            #
            # frame-shifting introns are checked if they are
            # fixed by indels either in the intron itself or
            # the terminal exon sequence. To this end, the effective
            # size of the intron is computed:
            # effective size of intron =
            # indels at terminal x bases at previous exon
            # + size of intron
            # + indels at terminal x bases at next exon
            effective_intron_size = len(intron_seq)
            # The original sliced from max(0, -frameshiftsize), which is
            # always 0 and therefore scanned the whole previous exon
            # instead of its terminal bases.
            terminal_start = max(0, len(last_exon_sequence) - frameshiftsize)
            previous_indels = _sumIndels(last_exon_sequence[terminal_start:])
            next_indels = _sumIndels(exon_sequence[:frameshiftsize])
            effective_intron_size += previous_indels + next_indels

            if previous_indels + next_indels == 0 and \
                    len(intron_seq) % 3 == 0:
                has_stop = "X" in Genomics.translate(intron_seq.upper(),
                                                     is_seleno=is_seleno)
            else:
                has_stop = False

            if effective_intron_size % 3 == 0 and not has_stop:
                E.debug(
                    "%s: fixed frame-shifting intron %i-%i of size %i (size:%i, indels:%i,%i)"
                    % (transcript_id, last_end, exon.start,
                       effective_intron_size,
                       len(intron_seq),
                       previous_indels, next_indels,))

                # add to previous exon
                cds.append(intron_seq)
                lcds += len(intron_seq)
                ncorrected_frameshifts += 1
                new_exon = False
            else:
                E.debug(
                    "%s: could not fix frame-shifting intron %i-%i of size %i (size:%i, indels:%i,%i, has_stop=%i)"
                    % (transcript_id, last_end, exon.start,
                       effective_intron_size,
                       len(intron_seq),
                       previous_indels, next_indels,
                       has_stop))

                nuncorrected_frameshifts += 1
                # start a new exon
                cds_starts.append(lcds)

        if E.global_options.loglevel >= 8:
            print("%i: intron_indels (%i-%i):" %
                  (allele_id, last_end, exon.start))
            if intron_key:
                for x, c in enumerate(introns[intron_key]):
                    if len(c) != 1:
                        print(x + last_end, ":%s:" % c)
                print()
                print(introns[intron_key])
                print("genome_pos=", genome_pos,
                      ",intron=%i-%i" % (genome_pos,
                                         genome_pos + len(intron_seq)),
                      ", len(intron_seq)=", len(intron_seq),
                      ", len(intron)=", exon.start - last_end,
                      ", offset at start=", _getOffset(last_end, offsets),
                      ", offset at end=", _getOffset(exon.start, offsets))
            else:
                print("empty intron")

        genome_pos += len(intron_seq)

        # assertion - check if genomic coordinate of intron is consistent
        # with offset
        test_offset = _getOffset(exon.start, offsets)
        is_offset = genome_pos - exon.start
        assert is_offset == test_offset, \
            "intron offset difference: %i != %i" % (is_offset, test_offset)

        ###################################################
        ###################################################
        ###################################################
        # add the exon
        exon_seq = "".join(exon_sequence)
        cds.append(exon_seq)

        if len(exon_seq) != exon.end - exon.start:
            nframeshifts += 1

        if new_exon:
            if reference_coordinates:
                exon_starts.append(exon.start + start_offset)
            else:
                exon_starts.append(genome_pos)

        _addCds2Reference(map_cds2reference, lcds, exon_sequence, exon.start)

        lcds += len(exon_seq)
        last_end = exon.end

        if E.global_options.loglevel >= 8:
            _printExonDebug(exon, exon_key, exon_seq, genome_pos,
                            start_offset, end_offset)

        genome_pos += len(exon_seq)

        test_offset = _getOffset(exon.end, offsets)
        is_offset = genome_pos - exon.end
        assert is_offset == test_offset, \
            "exon offset difference: %i != %i" % (is_offset, test_offset)

    cds = "".join(cds)
    assert lcds == len(cds)

    # fix incomplete codons at the end of the sequence
    if lcds % 3 != 0:
        offset = lcds % 3
        cds = cds[:-offset]

    # add frame correction for transcripts that do not start at frame=0
    start_frame = (3 - (int(transcript[0].frame) % 3)) % 3

    # n are ignored (? in sequence to deal with genes like Muc2)
    peptide = Genomics.translate("n" * start_frame + cds,
                                 is_seleno=is_seleno,
                                 prefer_lowercase=False,
                                 ignore_n=True)

    # find the first stop codon
    if start_frame != 0:
        # ignore first, potentially incomplete base
        pep_first_stop = peptide.upper().find("X", 1)
    else:
        pep_first_stop = peptide.upper().find("X")

    E.debug("%s: translated peptide = %s, first stop at %i" %
            (transcript_id, peptide, pep_first_stop))

    peptide = peptide.replace("?", "x")

    if E.global_options.loglevel >= 8:
        E.debug("peptide=%s" % peptide)
        E.debug("cds=%s" % cds)

    E.debug("%s: start_frame=%i, first stop at %i/%i" %
            (transcript_id, start_frame, pep_first_stop, len(peptide)))

    lpeptide, lcds = len(peptide), len(cds)

    # check for non-sense mediated decay
    if pep_first_stop != -1:
        cds_first_stop = pep_first_stop * 3 - start_frame
        if cds_first_stop < cds_starts[-1]:
            # stop codon lies before the start of the last exon
            if ncorrected_frameshifts or nuncorrected_frameshifts:
                E.warn(
                    "nmd knockout transcript %s has frameshifts: %i corrected, %i uncorrected"
                    % (transcript_id,
                       ncorrected_frameshifts,
                       nuncorrected_frameshifts))
            is_nmd_knockout = True
            cds = peptide = ""
            lpeptide, lcds = 0, 0
            reference_first_stop_start, reference_first_stop_end = \
                (map_cds2reference.mapRowToCol(cds_first_stop),
                 map_cds2reference.mapRowToCol(cds_first_stop + 3))
        elif pep_first_stop < len(peptide) - 1:
            is_stop_truncated = True
            cds = cds[:cds_first_stop]
            # the original evaluated this slice without assigning it,
            # leaving the peptide untruncated
            peptide = peptide[:pep_first_stop]
            lpeptide, lcds = len(peptide), len(cds)
            reference_first_stop_start, reference_first_stop_end = \
                (map_cds2reference.mapRowToCol(cds_first_stop),
                 map_cds2reference.mapRowToCol(cds_first_stop + 3))
        else:
            E.warn(
                "first stop at %i(cds=%i) ignored: last exon start at %i"
                % (pep_first_stop, cds_first_stop, cds_starts[-1]))
    else:
        # -1 for no stop codon found
        pep_first_stop = -1
        cds_first_stop = -1
        lpeptide, lcds = len(peptide), len(cds)

    # The original tested ``peptide is None``, which can never be true
    # (a knockout sets the peptide to the empty string).
    if is_nmd_knockout and nframeshifts == 0:
        E.warn(
            "transcript %s is knockout, though there are no indels - must be nonsense mutation"
            % (transcript_id))

    # build frames
    frames = [start_frame]
    start = start_frame
    length = 0
    for end in cds_starts[1:]:
        length += end - start
        frames.append((3 - length % 3) % 3)
        start = end

    return Allele._make((
        cds,
        peptide,
        len(cds_starts),
        cds_starts,
        exon_starts,
        frames,
        is_nmd_knockout,
        is_splice_truncated,
        is_stop_truncated,
        nframeshifts,
        ncorrected_frameshifts,
        nuncorrected_frameshifts,
        pep_first_stop,
        lpeptide,
        cds_first_stop,
        lcds,
        reference_first_stop_start,
        reference_first_stop_end,
        loriginal,
        nsplice_noncanonical,
    )), map_cds2reference
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads predictions line by line from stdin (lines starting with
    ``#`` are skipped), derives the exons of each prediction and writes
    one tab-separated row per intron to ``options.stdout``: prediction
    id, intron number, contig, strand, start, end, length, number of
    stop codons, intron type and the 5'/3' splice signals. A summary
    line with the number of inputs and outputs is written to
    ``options.stdlog`` if the log level is at least 1.

    NOTE(review): *argv* is defaulted to ``sys.argv`` but is not
    forwarded to ``E.Start`` in this function.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/predictions2introns.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-o", "--output-filename-summary",
                      dest="output_filename_summary", type="string",
                      help="filename with summary information.")

    parser.add_option("--skip-header", dest="skip_header",
                      action="store_true",
                      help="skip header.")

    parser.add_option(
        "--fill-introns", dest="fill_introns", type="int",
        help="fill intron if divisible by three and no stop codon up to a maximum length of #.")

    parser.add_option(
        "--introns-max-stops", dest="introns_max_stops", type="int",
        help="maximum number of stop codons to tolerate within an intron.")

    parser.add_option("--output-format", dest="output_format",
                      type="choice",
                      choices=("predictions", "extensions",
                               "filled-introns"),
                      help="output format.")

    parser.set_defaults(
        genome_file="genome",
        # trailing comma: ("ATG") is a plain string, not a tuple
        start_codons=("ATG",),
        stop_codons=("TAG", "TAA", "TGA"),
        skip_header=False,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if args:
        # Written so that it is valid syntax on both Python 2 and 3.
        # NOTE(review): USAGE is not defined in the visible source;
        # confirm it exists at module level.
        print("%s no arguments required." % USAGE)
        sys.exit(2)

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    p = PredictionParser.PredictionParserEntry()

    ninput, noutput = 0, 0

    if not options.skip_header:
        options.stdout.write("\t".join((
            "prediction_id",
            "intron",
            "contig",
            "strand",
            "start",
            "end",
            "length",
            "nstops",
            "type",
            "prime5",
            "prime3",
        )) + "\n")

    for line in sys.stdin:

        if line[0] == "#":
            continue

        ninput += 1
        p.Read(line)

        genomic_sequence = fasta.getSequence(p.mSbjctToken,
                                             p.mSbjctStrand,
                                             p.mSbjctGenomeFrom,
                                             p.mSbjctGenomeTo).upper()

        exons = Exons.Alignment2Exons(p.mMapPeptide2Genome,
                                      query_from=0,
                                      sbjct_from=0)

        last_e = exons[0]
        nintron = 0

        for e in exons[1:]:

            nintron += 1
            lintron = e.mGenomeFrom - last_e.mGenomeTo

            # NOTE(review): stop codons are only counted when the intron
            # length is NOT a multiple of three, which looks inverted
            # relative to the variable name and the --fill-introns help
            # text. Left unchanged; confirm the intent.
            intron_is_l3 = lintron % 3 != 0

            if intron_is_l3:
                # get sequence, include also residues from split codons
                # when checking for stop codons.
                # note that e.mAlignment can sometimes be empty. This might
                # be an exonerate bug. In the alignment string there are two
                # consecutive exons.
                if e.mAlignment and last_e.mAlignment and \
                        e.mAlignment[0][0] == "S":
                    offset_left = last_e.mAlignment[-1][2]
                    offset_right = e.mAlignment[0][2]
                else:
                    offset_left, offset_right = 0, 0

                sequence = genomic_sequence[
                    last_e.mGenomeTo - offset_left:
                    e.mGenomeFrom + offset_right]

                intron_nstops = sum(
                    1 for x in range(0, len(sequence), 3)
                    if sequence[x:x + 3] in options.stop_codons)
            else:
                intron_nstops = 0

            # check for splice signals
            sequence = genomic_sequence[last_e.mGenomeTo:e.mGenomeFrom]

            intron_type, prime5, prime3 = Genomics.GetIntronType(sequence)

            if options.loglevel >= 2:
                # clamp the left flank so that an exon ending within the
                # first six bases does not produce a negative slice start
                left_flank = genomic_sequence[
                    max(0, last_e.mGenomeTo - 6):last_e.mGenomeTo].lower()
                right_flank = genomic_sequence[
                    e.mGenomeFrom:e.mGenomeFrom + 6].lower()
                context = left_flank + "|" + sequence[:5] + "..." + \
                    sequence[-5:] + "|" + right_flank
                options.stdlog.write(
                    "\t".join(map(str, (p.mPredictionId,
                                        nintron,
                                        lintron,
                                        intron_nstops,
                                        intron_type,
                                        context))) + "\n")

            options.stdout.write(
                "\t".join(map(str, (p.mPredictionId,
                                    nintron,
                                    p.mSbjctToken,
                                    p.mSbjctStrand,
                                    last_e.mGenomeTo + p.mSbjctGenomeFrom,
                                    e.mGenomeFrom + p.mSbjctGenomeFrom,
                                    lintron,
                                    intron_nstops,
                                    intron_type,
                                    prime5,
                                    prime3))) + "\n")

            last_e = e

        # counted once per processed prediction
        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i.\n" %
                             (ninput, noutput))

    E.Stop()