Example #1
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None: argv = sys.argv

    # NOTE: the parser construction is missing from this excerpt; the call below
    # mirrors the pattern used in Example #2 (the version string is a placeholder).
    parser = E.OptionParser( version = "%prog version: $Id$",
                             usage = globals()["__doc__"] )

    parser.add_option("-m", "--filename-map", dest="filename_map", type="string",
                      help="filename with mapping information.")
    parser.add_option("-o", "--pattern-old", dest="pattern_old", type="string",
                      help="pattern for mapping new to old identifiers: extract string from old.")
    parser.add_option("-n", "--pattern-new", dest="pattern_new", type="string",
                      help="pattern for mapping new to old identifiers: put string into new.")
    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="genome_file.")
    parser.add_option("-p", "--peptides", dest="filename_peptides", type = "string",
                      help="filename with peptide sequences.")
    parser.add_option("-f", "--input-format", dest="input_format", type="choice",
                      help="format of mapping file", choices=("alignment", "offsets") )
    parser.add_option("-i", "--write-missed", dest="write_missed", type="string",
                      help="write missed identifiers to separate file.")
    parser.add_option("-a", "--filename-genes", dest="filename_genes", type="string",
                      help="filename with gene information.")
    parser.add_option("--filename-old-peptides", dest="filename_old_peptides", type="string",
                      help="filename with old peptide information.")
    parser.add_option("--no-renumber", dest="renumber", action="store_false",
                      help="do not renumber predictions.")
    parser.add_option("--contig-sizes-old", dest="contig_sizes_old", type="string",
                      help="contig sizes for old data.")
    parser.add_option("--contig-sizes-new", dest="contig_sizes_new", type="string",
                      help="contig sizes for new data.")
    parser.add_option("--skip-errors", dest="skip_errors", action="store_true",
                      help="skip entries with errors.")
    
    parser.set_defaults(
        filename_map = None,
        pattern_old = "(.+)",
        pattern_new = "%s",
        genome_file = None,
        filename_peptides = None,
        write_missed = None,
        filename_genes = None,
        filename_old_peptides = None,
        renumber = True,
        input_format = "alignment",
        contig_sizes_old = None,
        contig_sizes_new = None,
        skip_errors = None
        )

    (options, args) = E.Start( parser, add_pipe_options = True)

    predictor = PredictorExonerate()

    ## the different mapping criteria
    map_sbjcts = {}
    breakpoints = {}

    ################################################################################################
    map_transcript2gene = {}
    if options.filename_genes:
        infile = open(options.filename_genes, "r")
        for line in infile:
            if line.startswith("#"): continue
            gene, transcript = line[:-1].split("\t")[:2]
            map_transcript2gene[transcript] = gene
        infile.close()

    ################################################################################################
    peptides = {}
    if options.filename_peptides:
        peptides = Genomics.ReadPeptideSequences( open(options.filename_peptides, "r"))
        options.stdlog.write( "# read %i peptide sequences.\n" % len(peptides))

    ################################################################################################
    ## read old query sequences and compare against new query sequences
    ## this can be used to build a map between old and new queries
    query_map_old2new = {}
    # initialized here so that --write-missed works even without --filename-old-peptides
    unmappable, unmapped = [], []
    if options.filename_old_peptides:
        old_peptides = Genomics.ReadPeptideSequences( open(options.filename_old_peptides, "r"))
        options.stdlog.write( "# read %i old peptide sequences.\n" % len(old_peptides))
        query_map_old2new, unmappable, unmapped = Genomics.MapSequences( old_peptides, peptides)
        options.stdlog.write( "# built map: unmappable=%i unmapped=%i.\n" % (len(unmappable), len(unmapped)))
        if options.loglevel >= 2:
            options.stdlog.write( "# unmappable: %s.\n" % ";".join(unmappable))
            options.stdlog.write( "# unmapped: %s.\n" % ";".join(unmapped))            

    ################################################################################################
    ## read old/new contig sizes for mapping positive/negative coordinates
    contig_sizes_old = {}
    contig_sizes_new = {}
    if options.contig_sizes_old:
        contig_sizes_old = Genomics.ReadContigSizes( open(options.contig_sizes_old, "r") )
    if options.contig_sizes_new:
        contig_sizes_new = Genomics.ReadContigSizes( open(options.contig_sizes_new, "r") )
        
    ################################################################################################        
    if options.filename_map:
        
        infile = open(options.filename_map)
        if options.input_format == "alignment":
            for line in infile:
                if line[0] == "#": continue

                x, old_token, old_from, old_to, old_ali, new_from, new_to, new_ali = line[:-1].split("\t")

                map_sbjcts[old_token] = (old_from, old_ali, new_from, new_ali)

            if options.loglevel >= 1:
                options.stdlog.write( "# read %i alignments.\n" % len(map_sbjcts))

        elif options.input_format == "offsets":
            ## input is a list of segments and their offsets.

            breakpoints, endpoints, offsets = ReadOffsets( infile )
            if options.loglevel >= 1:
                options.stdlog.write( "# read breakpoints for %i chromosomes.\n" % len(breakpoints))

        infile.close()
        
    ################################################################################################
    ################################################################################################
    ################################################################################################
    ## end of input section
    ################################################################################################
    ################################################################################################
    ################################################################################################        
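    ## --pattern-old is compiled below to extract the identifier from the old
    ## subject token; --pattern-new re-inserts the extracted group to build the
    ## new subject token (see new_sbjct_token in the main loop).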

    rx = re.compile(options.pattern_old)
    last_sbjct_token = None
    ninput = 0
    nerrors = 0
    nerrors_map = 0
    nerrors_inconsistencies = 0
    nerrors_boundaries = 0
    nerrors_translation = 0
    nerrors_inconsequential = 0
    nerrors_realigned = 0
    nmapped = 0
    nfiltered = 0
    naligned = 0
    noutput = 0
    found_transcripts = {}
    nduplicates = 0
    output = {}
    
    for line in sys.stdin:
        if line[0] == "#": continue
        
        entry = PredictionParser.PredictionParserEntry()

        entry.Read( line )
        
        ninput += 1
        is_positive = entry.mSbjctStrand == "+"
        
        is_error = False
        
        ## check if query token is mappable: using sequence map
        if (query_map_old2new and entry.mQueryToken not in query_map_old2new):
            options.stdlog.write("# skipping prediction %i: obsolete query %s\n" % (entry.mPredictionId, entry.mQueryToken) )
            nfiltered += 1
            continue
        else:
            ## check if query token is mappable: using filter        
            if (peptides and entry.mQueryToken not in peptides):
                options.stdlog.write("# skipping prediction %i: obsolete query %s\n" % (entry.mPredictionId, entry.mQueryToken) )
                nfiltered += 1
                continue

        new_sbjct_token = options.pattern_new % rx.search(entry.mSbjctToken).groups()[0]

        ##########################################################################################################
        ## Map via alignments
        if entry.mSbjctToken in map_sbjcts:
            nmapped += 1
            if last_sbjct_token != entry.mSbjctToken:
                old_from, old_ali, new_from, new_ali = map_sbjcts[entry.mSbjctToken]
                map_a2b = alignlib_lite.makeAlignmentVector()
                alignlib_lite.AlignmentFormatExplicit(
                    int(old_from), old_ali,
                    int(new_from), new_ali).copy( map_a2b )
                
            last_sbjct_token = entry.mSbjctToken
            
            if options.loglevel >= 3:
                print "#", str(entry)
                print "#", map_sbjcts[entry.mSbjctToken]
                sys.stdout.flush()

            old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo

            ## convert to forward coordinates:
            if is_positive:
                f, t = old_f, old_t
                first_res, last_res = f + 1, t                
            else:
                f, t = map_a2b.getRowTo() - old_f, map_a2b.getRowTo() - old_t 
                first_res, last_res = f, t + 1 
            
            ## map first and last residues
            mfirst_res = map_a2b.mapRowToCol( first_res )
            mlast_res = map_a2b.mapRowToCol( last_res )

            if (mfirst_res == 0 and old_f != 0) or (mlast_res == 0 and old_t != map_a2b.getRowTo() ):
                
                options.stderr.write("# mapping not possible for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      first_res, last_res,
                                      mfirst_res, mlast_res,
                                      f, t))
                
                options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.write("# %s\n" % str(entry))                
                options.stderr.flush()                
                nerrors_boundaries += 1
                is_error = True

                ## get extended boundaries for alignment later on
                while mfirst_res == 0 and first_res > 1:
                    first_res -= 1
                    mfirst_res = map_a2b.mapRowToCol(first_res)
                while mlast_res == 0 and last_res < map_a2b.getRowTo():
                    last_res += 1
                    mlast_res = map_a2b.mapRowToCol(last_res)

            ## convert to genomic coordinates            
            ## convert negative strand coordinates
            if is_positive:
                new_f = mfirst_res - 1
                new_t = mlast_res 
            else:
                new_f = mfirst_res
                new_t = mlast_res - 1
                
                new_f = map_a2b.getColTo() - new_f
                new_t = map_a2b.getColTo() - new_t

            ## Now map the alignment.
            try:
                MapAlignment( entry, map_a2b )
                
            except ValueError:
                options.stderr.write("# alignment mapping not possible for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      first_res, last_res,
                                      mfirst_res, mlast_res,
                                      new_f, new_t,
                                      entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.flush()
                nerrors_map += 1
                is_error = True
            
            if new_f != entry.mSbjctGenomeFrom or new_t != entry.mSbjctGenomeTo:
                options.stderr.write("# mapping inconsistency for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i <> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      first_res, last_res,
                                      mfirst_res, mlast_res,                                      
                                      new_f, new_t,
                                      entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                
                nerrors_inconsistencies += 1
                is_error = True

        ##########################################################################################################
        ## Map via offsets
        if entry.mSbjctToken in breakpoints:

            old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo

            ## convert to forward coordinates:
            if is_positive:
                f, t = old_f, old_t
            else:
                f, t = contig_sizes_old[entry.mSbjctToken] - old_t, contig_sizes_old[entry.mSbjctToken] - old_f

            o1 = GetOffset( f,
                            breakpoints[entry.mSbjctToken],
                            endpoints[entry.mSbjctToken],
                            offsets[entry.mSbjctToken] )
            o2 = GetOffset( t,
                            breakpoints[entry.mSbjctToken],
                            endpoints[entry.mSbjctToken],
                            offsets[entry.mSbjctToken] )            

            if o1 != o2:
                options.stderr.write("# break within gene %s\n" % str(entry))
                nerrors_map += 1
                is_error = True
                
            f += o1
            t += o2

            if not is_positive:
                f, t = contig_sizes_new[entry.mSbjctToken] - t, contig_sizes_new[entry.mSbjctToken] - f

            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo = f, t

            if entry.mSbjctGenomeFrom > entry.mSbjctGenomeTo:
                options.stderr.write("# mapping error: start after end %s\n" % str(entry))
                nerrors_map += 1
                is_error = True
        
        ##########################################################################################################
        ## do translation check, if genome is given
        if options.genome_file:
            genomic_sequence = Genomics.GetGenomicSequence( new_sbjct_token, entry.mSbjctStrand,
                                                            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo,
                                                            options.genome_file,
                                                            loglevel = 0)

            map_peptide2translation, translation = Genomics.Alignment2PeptideAlignment( \
                entry.mMapPeptide2Genome, entry.mQueryFrom, 0, genomic_sequence )

            if re.sub("X", "", translation) != re.sub("X", "", entry.mTranslation):
                options.stderr.write("# translation error for prediction %i on %s %s:%i-%i -> %i-%i <> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                if entry.mSbjctToken in map_sbjcts:
                    options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.write("# old=%s\n# new=%s\n" % (entry.mTranslation, translation))
                options.stderr.write("# old=%s\n# new=%s\n" % (entry.mAlignmentString, Genomics.Alignment2String(entry.mMapPeptide2Genome)))                    
                nerrors_translation += 1
                is_error = True

                if peptides and entry.mQueryToken in peptides:
                    naligned += 1

                    options.stdlog.write( "# aligning: %s versus %s:%s: %i-%i\n" % ( \
                        entry.mQueryToken,
                        new_sbjct_token, entry.mSbjctStrand,
                        entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                    
                    # do a quick reprediction
                    if entry.mQueryToken in peptides:
                        genomic_sequence = Genomics.GetGenomicSequence( new_sbjct_token, entry.mSbjctStrand,
                                                                        0, 0,
                                                                        genome_file = options.genome_file,
                                                                        loglevel = 0)
                        predictor.mLogLevel = 0

                        
                        result =  predictor(entry.mQueryToken, peptides[entry.mQueryToken],
                                            entry.mSbjctToken, genomic_sequence,
                                            "--exhaustive --subopt FALSE --score '%s' " % str(80),
                                            new_f - 10, new_t + 10)
                        prediction_id = entry.mPredictionId
                        if result:
                            entry = result[0]
                            entry.mPredictionId = prediction_id
                            nerrors_realigned += 1
            else:
                if is_error:
                    nerrors_inconsequential += 1
                    
        entry.mSbjctToken = new_sbjct_token

        ## map query tokens
        if query_map_old2new:
            query_tokens = query_map_old2new[entry.mQueryToken]
        else:
            query_tokens = (entry.mQueryToken,)

        if options.skip_errors and is_error:
            continue

        for query_token in query_tokens:

            entry.mQueryToken = query_token
            
            prediction_id = entry.mPredictionId
            entry.mPredictionId = 0
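            # Genomics.GetHID presumably returns a content hash of the entry;
            # entries with an identical hash are counted as duplicates and skipped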
            
            hid = Genomics.GetHID( str(entry) )
            if hid in output:
                nduplicates += 1
                continue
            output[hid] = 1
            
            noutput += 1                        
            if options.renumber: prediction_id = noutput

            entry.mPredictionId = prediction_id

            options.stdout.write( str(entry) + "\n")
            options.stdout.flush()
            found_transcripts[entry.mQueryToken] = 1

    ## write out found transcripts and genes
    nmissed_transcripts = 0
    missed_transcripts = []
    found_genes = {}
    if peptides:
        for x in peptides.keys():
            if x not in found_transcripts:
                nmissed_transcripts += 1
                missed_transcripts.append( x )
            else:
                found_genes[map_transcript2gene[x]] = 1

    missed_genes = {}
    nmissed_genes = 0
    if map_transcript2gene:

        for t in missed_transcripts:
            g = map_transcript2gene[t]
            if g not in found_genes:
                missed_genes[g] = 1
        nmissed_genes = len(missed_genes)
    
    if options.write_missed:
        outfile = open(options.write_missed, "w")
        for x in missed_transcripts:
            if x in unmapped:
                status = "unmapped"
            else:
                status = "mapped"
            outfile.write( "%s\t%s\t%s\n" % ("transcript", x, status ))
        for x in missed_genes:
            status = "unknown"
            outfile.write( "%s\t%s\t%s\n" % ("gene", x, status ))
        
        outfile.close()
        
    options.stdlog.write("# input=%i, output=%i, filtered=%i, nduplicates=%i, mapped=%i, errors=%i\n" % (\
         ninput, noutput, nfiltered, nduplicates, nmapped, nerrors ))
    options.stdlog.write("# errors: inconsequental=%i, boundaries=%i, mapping=%i, inconsistencies=%i, translation=%i, realigned=%i\n" % (\
       nerrors_inconsequential, nerrors_boundaries, nerrors_map, nerrors_inconsistencies, nerrors_translation, nerrors_realigned ))
    options.stdlog.write("# peptides: input=%i, found=%i, missed=%i, found_genes=%i, missed_genes=%i\n" % (\
        len(peptides), len(found_transcripts), nmissed_transcripts, len(found_genes), nmissed_genes) )
    
    E.Stop()
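
Note: ReadOffsets and GetOffset are defined elsewhere in the original script and are
not part of this excerpt. Purely as an illustration, a minimal sketch of a
GetOffset-style lookup, assuming breakpoints, endpoints and offsets are parallel,
sorted per-contig lists (segment starts, segment ends, coordinate shifts); the
actual implementation may differ.

import bisect

def get_offset_sketch(pos, breakpoints, endpoints, offsets):
    """Illustrative sketch only: return the coordinate shift for the segment
    containing pos, assuming the three lists are parallel and sorted by position."""
    # index of the right-most segment starting at or before pos
    i = bisect.bisect_right(breakpoints, pos) - 1
    if i < 0 or pos > endpoints[i]:
        raise ValueError("position %i not covered by any segment" % pos)
    return offsets[i]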
Example #2
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gpipe/predictions2transcripts.py 1841 2008-05-08 12:07:13Z andreas $",
                             usage = globals()["__doc__"] )
    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome."  )

    parser.add_option("-o", "--output-filename-summary", dest="output_filename_summary", type="string",
                      help="filename with summary information."  )

    parser.add_option( "--skip-header", dest="skip_header", action="store_true",
                       help="skip header."  )

    parser.add_option( "--start-codon-boundary", dest="start_codon_boundary", type="int",
                      help="maximum extension for start codon (make divisible by 3)."  )
    
    parser.add_option( "--stop-codon-boundary", dest="stop_codon_boundary", type="int",
                      help="maximum extension for stop codon (make divisible by 3)."  )

    parser.add_option( "--left-extension-mode", dest="left_extension_mode", type="choice",
                       choices=("first-start", "first-stop-backtrack"),
                       help="extension mode for 5' end.")

    parser.add_option( "--fill-introns", dest="fill_introns", type="int",
                      help="fill intron if divisible by three and no stop codon up to a maximum length of #."  )

    parser.add_option( "--introns-max-stops", dest="introns_max_stops", type="int",
                      help="maximum number of stop codons to tolerate within an intron."  )

    parser.add_option( "--output-format", dest="output_format", type="choice",
                       choices=("predictions", "extensions", "filled-introns"),
                      help="output format."  )
    
    parser.set_defaults(
        genome_file = "genome",
        start_codons = ("ATG",),
        stop_codons = ("TAG", "TAA", "TGA"),
        start_codon_boundary = 9999,
        stop_codon_boundary  = 9999,
        fill_introns = 0,
        introns_max_stops = 0,
        left_splice_signals = ("GT",),
        right_splice_signals = ("AG",),
        output_format="extensions",
        left_extension_mode = "first-start",
        skip_header = False,
        output_filename_summary = None,
        )

    (options, args) = E.Start( parser, add_pipe_options = True )

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    options.start_codon_boundary = int(options.start_codon_boundary / 3)
    options.stop_codon_boundary = int(options.stop_codon_boundary / 3)

    fasta = IndexedFasta.IndexedFasta( options.genome_file )
    
    p = PredictionParser.PredictionParserEntry()

    ninput, noutput = 0, 0
    nfilled = 0
    nseqs_filled = 0
    nseqs_extended = 0
    left_extensions = []
    right_extensions = []
    filled_introns = []

    if not options.skip_header:
        if options.output_format == "predictions":
            options.stdout.write( Prediction.Prediction().getHeader() + "\n" )
        elif options.output_format == "filled-introns":
            options.stdout.write("\t".join( ("prediction_id",
                                             "intron",
                                             "peptide_sequence",
                                             "genomic_sequence") ) + "\n" )

    if options.output_filename_summary:
        outfile_summary = open(options.output_filename_summary, "w")
        outfile_summary.write( "id\ttype\tnumber\tlength\tfrom\tto\tsequence\n" )
    else:
        outfile_summary = None

    for line in options.stdin:
        
        if line[0] == "#": continue

        ninput += 1
        p.Read(line)

        lsequence = fasta.getLength( p.mSbjctToken )

        genome_from = max( 0, p.mSbjctGenomeFrom - options.start_codon_boundary)
        genome_to = min( lsequence, p.mSbjctGenomeTo + options.stop_codon_boundary)
        
        genomic_sequence = fasta.getSequence( p.mSbjctToken, p.mSbjctStrand,
                                              genome_from,
                                              genome_to ).upper()

        ########################################################################
        ########################################################################
        ########################################################################            
        ## Do extensions
        
        if options.start_codon_boundary or options.stop_codon_boundary:
            
            extension_start = p.mSbjctGenomeFrom - genome_from 
            extension_stop  = genome_to - p.mSbjctGenomeTo
            
            fragment_to = extension_start + p.mSbjctGenomeTo - p.mSbjctGenomeFrom

            lfragment = len(genomic_sequence)

            ########################################################################
            ########################################################################
            ########################################################################            
            ## find start codon
            start = extension_start
            found_start = False
            if options.left_extension_mode == "first-start":

                found_start, start = findCodonReverse( genomic_sequence,
                                                       start,
                                                       options.start_codons,
                                                       options.stop_codons )
                
            elif options.left_extension_mode == "first-stop-backtrack":

                if genomic_sequence[start:start+3] in options.start_codons:
                    found_start = True
                else:
                    found_start, start = findCodonReverse( genomic_sequence,
                                                           start,
                                                           options.stop_codons )
                    
                    if found_start:
                        E.info("prediction %s: stop found at %i (%i) backtracking ..." % ( p.mPredictionId, start, extension_start - start) )
                        
                        ## backtrack to first start codon
                        found_start = False
                        while start < extension_start:
                            start += 3
                            if genomic_sequence[start:start+3] in options.start_codons:
                                found_start = True
                                break
                        else:
                            start = extension_start

                        if found_start:
                            E.info("start codon found at %i (%i)." % ( start, extension_start - start) )
                        else:
                            E.info("no start codon found." )
                    else:
                        E.info("prediction %s: no stop found ... backtracking to start codon." % ( p.mPredictionId ) )

                        found_start, start = findCodonReverse( genomic_sequence, start, options.start_codons )

                        if not found_start:
                            E.info("prediction %s: no start codon found." % ( p.mPredictionId ) )

            if found_start:
                start += genome_from
            else:
                start = p.mSbjctGenomeFrom

            dstart = p.mSbjctGenomeFrom - start
            
            ########################################################################
            ########################################################################
            ########################################################################            
            ## find stop codon
            ## stop points to the beginning of the codon, thus the stop codon will
            ## not be part of the sequence.
            stop = fragment_to
            found_stop = 0
            while stop < lfragment and \
                      genomic_sequence[stop:stop+3] not in ("NNN", "XXX"):
                if genomic_sequence[stop:stop+3] in options.stop_codons:
                    found_stop = 1
                    break

                stop += 3

            if found_stop:
                stop += genome_from 
            else:
                stop = p.mSbjctGenomeTo

            dstop = stop - p.mSbjctGenomeTo 

            ########################################################################
            ########################################################################
            ########################################################################            
            ## build new prediction
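            ## the 5'/3' extensions are appended to the peptide-to-genome alignment
            ## as ("G", 0, length) segments (presumably genome-only states in the
            ## alignment format used by Genomics.Alignment2String)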
            map_peptide2genome = []
            if dstart: map_peptide2genome.append( ("G", 0, dstart) )
            map_peptide2genome += p.mMapPeptide2Genome
            if dstop: map_peptide2genome.append( ("G", 0, dstop) )

            E.info("prediction %s: extension: found_start=%i, found_stop=%i, left=%i, right=%i" % ( p.mPredictionId, found_start, found_stop, dstart, dstop ) )

            ## save results
            p.mMapPeptide2Genome = map_peptide2genome
            p.mAlignmentString = Genomics.Alignment2String( map_peptide2genome )
            p.mSbjctGenomeFrom -= dstart
            p.mSbjctGenomeTo += dstop
            p.mSbjctFrom += dstart / 3
            p.mSbjctTo += dstart / 3 + dstop / 3            
            
            if dstart or dstop:
                if dstart: left_extensions.append( dstart )
                if dstop: right_extensions.append( dstop )
                
                nseqs_extended += 1

        ## update genomic sequence because borders might have changed.
        genomic_sequence = fasta.getSequence( p.mSbjctToken,
                                              p.mSbjctStrand,
                                              p.mSbjctGenomeFrom,
                                              p.mSbjctGenomeTo ).upper()

        if options.fill_introns:
            
            has_filled = False

            exons = Exons.Alignment2Exons( p.mMapPeptide2Genome,
                                           query_from = 0,
                                           sbjct_from = 0 )

            new_exons = []

            last_e = exons[0]

            nintron = 0

            for e in exons[1:]:

                nintron += 1
                lintron = e.mGenomeFrom - last_e.mGenomeTo
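                # lintron is the intron length; only short (<= --fill-introns),
                # in-frame introns (length a multiple of three) are candidates for filling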
                
                if lintron > options.fill_introns or lintron % 3 != 0:
                    E.debug( "prediction %s: intron %i of size %i discarded." % \
                                 (p.mPredictionId,
                                  nintron, lintron ) )
                    
                    new_exons.append(last_e)
                    last_e = e
                    continue

                ## get sequence, include also residues from split codons
                ## when checking for stop codons.
                if e.mAlignment[0][0] == "S":
                    offset_left = last_e.mAlignment[-1][2]
                    offset_right = e.mAlignment[0][2]
                else:
                    offset_left, offset_right = 0, 0
                    
                sequence = genomic_sequence[last_e.mGenomeTo - offset_left:e.mGenomeFrom+offset_right]
                
                ## check for splice sites
                for signal in options.left_splice_signals:
                    if sequence[offset_left:offset_left+len(signal)] == signal:
                        left_signal = True
                        break
                else:
                    left_signal = False
                    
                for signal in options.right_splice_signals:
                    # use explicit indices: with offset_right == 0 the original
                    # slice [...:-0] would be empty and the signal never matched
                    if sequence[len(sequence) - offset_right - len(signal):len(sequence) - offset_right] == signal:
                        right_signal = True
                        break
                else:
                    right_signal = False

                nstops, ngaps = 0, 0
                for codon in [ sequence[x:x+3] for x in range(0,len(sequence),3) ]:
                    if codon in options.stop_codons: nstops += 1
                    if "N" in codon.upper(): ngaps += 1
                        
                    E.debug( "prediction %s: intron %i of size %i (%i-%i) (%s:%s:%i:%i): stops=%i, gaps=%i, signals=%s,%s." % \
                                 (p.mPredictionId,
                                  nintron, lintron,
                                  offset_left, offset_right,
                                  p.mSbjctToken, p.mSbjctStrand,
                                  p.mSbjctGenomeFrom + last_e.mGenomeTo,
                                  p.mSbjctGenomeFrom + e.mGenomeFrom,
                                  nstops,
                                  ngaps,
                                  left_signal, right_signal ) )

                if nstops + ngaps > options.introns_max_stops:
                    new_exons.append(last_e)                                        
                    last_e = e
                    continue
                
                E.info( "prediction %s: filling intron %i of size %i: stops=%i, gaps=%i, signals=%s,%s" % \
                            (p.mPredictionId,
                             nintron, lintron,
                             nstops,
                             ngaps,
                             left_signal, right_signal))

                e.Merge( last_e )
                has_filled = True
                nfilled += 1
                last_e = e

                if options.output_format == "filled-introns":
                    options.stdout.write( "\t".join( map(str, ( p.mPredictionId,
                                                                nintron,
                                                                Genomics.TranslateDNA2Protein( sequence ),
                                                                sequence ) ) ) + "\n" )

                filled_introns.append(lintron)
                p.mNIntrons -= 1
                
            new_exons.append(last_e)

            if has_filled: nseqs_filled += 1

            Exons.UpdatePeptideCoordinates( new_exons )
            
            p.mMapPeptide2Genome = Exons.Exons2Alignment( new_exons )
            p.mAlignmentString = Genomics.Alignment2String( p.mMapPeptide2Genome )

        ## build translated sequence
        p.mMapPeptide2Translation, p.mTranslation = Genomics.Alignment2PeptideAlignment( \
               p.mMapPeptide2Genome, p.mQueryFrom, 0, genomic_sequence )

        ## output info
        if options.output_format == "predictions":
            options.stdout.write( str(p) + "\n" )
        elif options.output_format == "extensions":
            found_start = 1 if found_start else 0
            found_stop = 1 if found_stop else 0
            options.stdout.write( "\t".join( map(str, ( p.mPredictionId,
                                                        found_start, found_stop, 
                                                        dstart, dstop,
                                                        p.mTranslation,
                                                        p.mSbjctGenomeFrom, p.mSbjctGenomeTo,
                                                        p.mAlignmentString ))) + "\n" )

        noutput += 1
        options.stdout.flush()

    E.info("stats  : %s" % "\t".join(Stats.DistributionalParameters().getHeaders() ))
    E.info("left   : %s" % str(Stats.DistributionalParameters(left_extensions)) )
    E.info("right  : %s" % str(Stats.DistributionalParameters(right_extensions)) )
    E.info("introns: %s" % str(Stats.DistributionalParameters(filled_introns)) )        
    E.info("ninput=%i, noutput=%i, nextended=%i, nfilled=%i, nexons_filled=%i" % (\
            ninput, noutput, nseqs_extended, nseqs_filled, nfilled))
        
    E.Stop()
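
Note: findCodonReverse is defined elsewhere in the original script and is not shown
here. Purely as an illustration, a minimal sketch of such an upstream codon scan,
assuming it steps back three bases at a time from the given position, returns the
position of the first codon found in the target set, and aborts at the first codon
from an optional abort set (e.g. stop codons); the actual implementation may differ.

def find_codon_reverse_sketch(sequence, start, codons, abort_codons=()):
    """Illustrative sketch only: scan upstream of start in steps of three.

    Returns (found, position): found is True if a codon from 'codons' was hit,
    False if a codon from 'abort_codons' was hit first or the start of the
    sequence was reached; position is where the scan stopped."""
    pos = start
    while pos >= 3:
        pos -= 3
        triplet = sequence[pos:pos + 3]
        if triplet in codons:
            return True, pos
        if triplet in abort_codons:
            return False, pos
    return False, pos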