Exemplo n.º 1
0
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gpipe/predictions2disruptions.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome pattern."  )
    
    parser.add_option( "--start-codon-boundary", dest="start_codon_boundary", type="int",
                      help="maximum extension for start codon (make divisible by 3)."  )
    
    parser.add_option( "--stop-codon-boundary", dest="stop_codon_boundary", type="int",
                      help="maximum extension for stop codon (make divisible by 3)."  )

    
    parser.set_defaults(
        genome_file = "genome.fasta",
        stop_codons = ("TAG", "TAA", "TGA")
        )

    (options, args) = E.Start( parser, add_pipe_options = True )

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    p = PredictionParser.PredictionParserEntry()

    fasta = IndexedFasta.IndexedFasta( options.genome_file )
    
    for line in sys.stdin:
        
        if line[0] == "#": continue

        p.Read(line)

        genomic_sequence = fasta.getSequence( p.mSbjctToken, p.mSbjctStrand,
                                              p.mSbjctGenomeFrom, p.mSbjctGenomeTo )
        
        if options.loglevel >= 2:
            options.stdlog.write ("# parsing alignment %s\n" % p.mAlignmentString)
        try:
            nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions =\
                      Genomics.CountGeneFeatures( 0,
                                                  p.mMapPeptide2Genome,
                                                  genomic_sequence,
                                                  border_stop_codon = 0,
                                                  stop_codons = options.stop_codons )
        except ValueError, msg:
            options.stderr.write( "# parsing error: %s in line %s\n" % (line[:-1], msg))
            sys.exit(1)

        for type, \
                cds_pos_from, cds_pos_to, \
                genome_pos_from, genome_pos_to in disruptions:
            options.stdout.write( "\t".join(map(str, (p.mPredictionId,
                                                      type,
                                                      cds_pos_from, cds_pos_to,
                                                      genome_pos_from + p.mSbjctGenomeFrom,
                                                      genome_pos_to + p.mSbjctGenomeFrom) ) )+ "\n")

        options.stdout.flush()
Exemplo n.º 2
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gpipe/gff2predictions.py 2021 2008-07-10 16:00:48Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--trans",
                      dest="trans",
                      help="input is translated DNA.",
                      action="store_true")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      help="input format.",
                      type="choice",
                      choices=("exons", "psl", "gff"))

    parser.add_option("-o",
                      "--output-format",
                      dest="output_format",
                      help="output format",
                      type="choice",
                      choices=('exontable', 'exons', 'predictions', 'cds',
                               'fasta'))

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genomic data (indexed).")

    parser.add_option(
        "--predictions-file",
        dest="predictions_file",
        type="string",
        help=
        "filename with predictions. Use gene structures from this file if available."
    )

    parser.add_option("-i",
                      "--gff-field-id",
                      dest="gff_field_id",
                      type="string",
                      help="field for the feature id in the gff info section.")

    parser.add_option(
        "-p",
        "--filename-peptides",
        dest="filename_peptides",
        type="string",
        help=
        "Filename with peptide sequences. If given, it is used to check the predicted translated sequences."
    )

    parser.add_option(
        "--no-realignment",
        dest="do_realignment",
        action="store_false",
        help="do not re-align entries that do not parse correctly.")

    parser.add_option(
        "--remove-unaligned",
        dest="remove_unaligned",
        action="store_true",
        help="remove entries that have not been aligned correctly.")

    parser.add_option(
        "--input-coordinates",
        dest="input_coordinates",
        type="string",
        help=
        "specify input format for input coordinates [forward|both-zero|one-closed|open]."
    )

    parser.set_defaults(trans=False,
                        output_format="predictions",
                        format="psl",
                        gff_field_id='id',
                        input_coordinates="both-zero-open",
                        filename_peptides=None,
                        genome_file=None,
                        do_realignment=True,
                        predictions_file=None,
                        remove_unaligned=False)

    (options, args) = E.Start(parser)

    if not options.genome_file:
        raise "please specify a genome file."

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contig_sizes = fasta.getContigSizes()

    ninput, noutput, nskipped = 0, 0, 0
    nfound, nnotfound, nidentical, nmismatch, naligned, nunaligned = 0, 0, 0, 0, 0, 0

    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            IOTools.openFile(options.filename_peptides, "r"))
        predictor = Predictor.PredictorExonerate()
        predictor.mLogLevel = 0
    else:
        peptide_sequences = None
        predictor = None

    converter = IndexedFasta.getConverter(options.input_coordinates)

    predictions = {}
    if options.predictions_file:
        parser = PredictionParser.iterator_predictions(
            IOTools.openFile(options.predictions_file, "r"))
        for p in parser:
            predictions[p.mPredictionId] = p

    if options.output_format == "predictions":

        if options.format == "psl":

            if options.trans:
                parser = PredictionParser.PredictionParserBlatTrans()
            else:
                parser = PredictionParser.PredictionParserBlatCDNA()

            nmatches = 1
            for line in sys.stdin:
                if line[0] == "#":
                    continue
                if not re.match("^[0-9]", line):
                    continue

                try:
                    entries = parser.Parse((line, ))
                except PredictionParser.AlignmentError, e:
                    print "# %s" % str(e)
                    print "#", line[:-1]
                    sys.exit(1)

                for entry in entries:
                    entry.mPredictionId = nmatches
                    nmatches += 1

                print str(entries)

        elif options.format == "exons":
            parser = PredictionParser.PredictionParserExons(
                contig_sizes=contig_sizes)
        else:
            raise "unknown format %s for output option %s" % (
                options.format, options.output_format)

        if options.loglevel >= 2:
            options.stdlog.write("# parsing.\n")
            options.stdlog.flush()

        results = parser.Parse(sys.stdin.readlines())

        if options.loglevel >= 2:
            options.stdlog.write("# parsing finished.\n")
            options.stdlog.flush()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# parsing: ninput=%i, noutput=%i, nerrors=%i\n" %
                (parser.GetNumInput(), parser.GetNumOutput(),
                 parser.GetNumErrors()))

            for error, msg in parser.mErrors:
                options.stdlog.write("# %s : %s\n" % (str(error), msg))
                options.stdlog.flush()

        # if genomes are given: build translation
        if options.genome_file:

            results.Sort(lambda x, y: cmp(x.mSbjctToken, y.mSbjctToken))

            new_results = PredictionParser.Predictions()

            for entry in results:

                ninput += 1

                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# processing entry %s:%s on %s:%s %i/%i.\n" %
                        (entry.mPredictionId, entry.mQueryToken,
                         entry.mSbjctToken, entry.mSbjctStrand, ninput,
                         len(results)))
                    options.stdlog.flush()

                try:
                    lgenome = fasta.getLength(entry.mSbjctToken)
                    # added 3 residues - was a problem at split codons just before the stop.
                    # See for example the chicken sequence ENSGALP00000002741
                    genomic_sequence = fasta.getSequence(
                        entry.mSbjctToken, entry.mSbjctStrand,
                        entry.mSbjctGenomeFrom,
                        min(entry.mSbjctGenomeTo + 3, lgenome))

                except KeyError:
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# did not find entry for %s on %s.\n" %
                            (entry.mPredictionId, entry.mSbjctToken))
                    nskipped += 1
                    continue

                if predictions and entry.mPredictionId in predictions:
                    if options.loglevel >= 2:
                        options.stdlog.write(
                            "# substituting entry %s on %s:%s.\n" %
                            (entry.mPredictionId, entry.mSbjctToken,
                             entry.mSbjctStrand))
                        options.stdlog.flush()
                    entry = predictions[entry.mPredictionId]

                exons = Exons.Alignment2Exons(entry.mMapPeptide2Genome, 0,
                                              entry.mSbjctGenomeFrom)

                entry.mMapPeptide2Translation, entry.mTranslation = Genomics.Alignment2PeptideAlignment(
                    Genomics.String2Alignment(entry.mAlignmentString),
                    entry.mQueryFrom, 0, genomic_sequence)

                entry.score = entry.mMapPeptide2Translation.getColTo(
                ) - entry.mMapPeptide2Translation.getColFrom() + 1

                (entry.mNIntrons, entry.mNFrameShifts, entry.mNGaps, entry.mNSplits, entry.mNStopCodons, entry.mNDisruptions ) = \
                    Genomics.CountGeneFeatures(0,
                                               entry.mMapPeptide2Genome,
                                               genomic_sequence)

                if peptide_sequences:

                    if str(entry.mPredictionId) in peptide_sequences:

                        reference = peptide_sequences[str(
                            entry.mPredictionId)].upper()

                        translation = entry.mTranslation
                        nfound += 1

                        is_identical, nmismatches = checkIdentity(
                            reference, translation, options)

                        if is_identical:
                            nidentical += 1
                        else:
                            nmismatch += 1

                            if options.do_realignment:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches..realigning in region %i:%i\n"
                                        % (entry.mPredictionId,
                                           entry.mSbjctGenomeFrom,
                                           entry.mSbjctGenomeTo))
                                    options.stdlog.flush()

                                    result = predictor(
                                        entry.mPredictionId, reference,
                                        entry.mSbjctToken, genomic_sequence,
                                        "--subopt FALSE --score '%s'" %
                                        str(80))
                                    # "--exhaustive --subopt FALSE --score '%s'" % str(80) )

                                    if result:
                                        translation = result[0].mTranslation
                                        is_identical, nmismatches = checkIdentity(
                                            reference, translation, options)
                                    else:
                                        if options.loglevel >= 2:
                                            options.stdlog.write(
                                                "# %s: realignment returned empty result\n"
                                                % (entry.mPredictionId))
                                            options.stdlog.flush()
                                        is_identical = False

                                    if is_identical:
                                        naligned += 1
                                        prediction_id = entry.mPredictionId
                                        sbjct_genome_from = entry.mSbjctGenomeFrom
                                        entry = result[0]
                                        entry.mPredictionId = prediction_id
                                        entry.mSbjctGenomeFrom += sbjct_genome_from
                                    else:
                                        nunaligned += 1
                                        if options.loglevel >= 1:
                                            options.stdlog.write(
                                                "# %s: mismatch on %s:%s:%i-%i after realignment\n# reference =%s\n# translated=%s\n# realigned =%s\n"
                                                %
                                                (entry.mPredictionId,
                                                 entry.mSbjctToken,
                                                 entry.mSbjctStrand,
                                                 entry.mSbjctGenomeFrom,
                                                 entry.mSbjctGenomeTo,
                                                 reference, entry.mTranslation,
                                                 translation))
                                            options.stdlog.flush()
                                        if options.remove_unaligned:
                                            nskipped += 1
                                            continue

                            else:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches on %s ... no realignment\n"
                                        % (
                                            entry.mPredictionId,
                                            entry.mSbjctToken,
                                        ))
                                    if options.loglevel >= 3:
                                        options.stdlog.write(
                                            "# %s: mismatch before realignment\n# reference =%s\n# translated=%s\n"
                                            % (entry.mPredictionId, reference,
                                               translation))
                                    options.stdlog.flush()

                                if options.remove_unaligned:
                                    nskipped += 1
                                    continue

                    else:
                        nnotfound += 1

                new_results.append(entry)
                noutput += 1

            results = new_results
        if results:
            options.stdout.write(str(results) + "\n")
Exemplo n.º 3
0
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gpipe/assignments2pairs.py 2011 2008-07-04 10:40:51Z andreas $", usage = globals()["__doc__"] )

    parser.add_option( "--peptides", dest="filename_peptides", type="string",
                      help=""  )

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome."  )

    parser.add_option("-s", "--suffix", dest="suffix", type="string",
                      help=""  )

    parser.add_option("-p", "--prefix", dest="prefix", type="string",
                      help=""  )

    parser.add_option("-a", "--output-pattern", dest="filename_output_pattern", type="string",
                      help=""  )

    parser.add_option("-f", "--format", dest="format", type="string",
                      help=""  )

    parser.add_option("-i", "--input-format", dest="input_format", type="string",
                      help=""  )

    parser.add_option("-u", "--clusters", dest="filename_clusters", type="string",
                      help=""  )

    parser.add_option( "--filename-previous", dest="filename_previous", type="string",
                      help=""  )

    parser.add_option("-m", "--max-margin", dest="max_margin", type="int",
                      help=""  )

    parser.add_option("-n", "--min-margin", dest="min_margin", type="int",
                      help=""  )

    parser.add_option("-d", "--default-margin", dest="default_margin", type="int",
                      help=""  )

    parser.add_option("-r", "--max-region", dest="max_region_nr", type="int",
                      help=""  )

    parser.add_option("-c", "--chunk", dest="chunk_size", type="int",
                      help=""  )

    parser.add_option("-k", "--offset-key", dest="offset_key", action="store_true",
                      help=""  )

    parser.add_option("-t", "--conserve-strand", dest="conserve_strand", action="store_true",
                      help=""  )

    parser.add_option("-o", "--forward-coordinates", dest="forward_coordinates", action="store_true",
                      help=""  )

    parser.add_option( "--no-sequence", dest="no_sequence", action="store_true",
                      help=""  )

    parser.add_option( "--combine-exons", dest="combine_exons", action="store_true",
                      help=""  )

    parser.set_defaults( 
        ## pattern for genomes, %s is substituted for the sbjct_token
        genome_file = "genome_%s.fasta",
        filename_peptides = None,
        ## margin to add to genomic segments
        max_margin = 0,
        min_margin = 0,
        default_margin = 0,
        offset_key = 0,
        chunk_size = 100,
        report_step = 1000,
        ## wheher to combine exons
        combine_exons = False,
        ## output format
        format = "single_fasta",
        ## prefix/suffix for output files
        suffix = ".fasta",
        prefix = "",
        filename_clusters = None,
        output = None,
        ## conserve strand
        conserve_strand = None,
        ## input format
        input_format = None,
        forward_coordinates = None,
        ## maximum number of predictions per region (0=all)
        max_region_nr = 0,
        filename_output_pattern = None,
        ## do not write sequences into output
        no_sequence = None,
        ## filename with previous results
        filename_previous = None, )


    (options, args) = E.Start( parser )
            
    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(1)

    if not options.filename_output_pattern:
        options.filename_output_pattern = options.prefix + "%i" + options.suffix

    # read peptide sequences
    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences( open(options.filename_peptides, "r") )
    else:
        peptide_sequences = {}

    if options.loglevel >= 1:
        print "# read %i peptide sequences." % len(peptide_sequences)
        sys.stdout.flush()
        
    # read clustering information
    if options.filename_clusters:
        ## Note: if there are no alternative transcripts, map_rep2mem and map_mem2rep will be empty.
        ## thus add some dummy variables so that filtering will work.
        map_rep2mem, map_mem2rep = Genomics.ReadMap( open(options.filename_clusters, "r"))
        map_rep2mem['dummy'] = ["dummy",]
        map_mem2rep['dummy'] = "dummy"        
    else:
        map_rep2mem, map_mem2rep = {}, {}

    if options.loglevel >= 1:
        print "# read members: mem2rep=%i, rep2mem=%i" % (len(map_mem2rep), len(map_rep2mem))
        sys.stdout.flush()

    map_previous = {}
    # read previous data
    if options.filename_previous:
        
        entry = PredictionParser.PredictionParserEntry()


        infile = open(options.filename_previous, "r")
        for line in infile:
            if line[0] == "#": continue
            
            if options.input_format == "graph":
                data = line.split("\t")
                (region_id, region_nr, region_max_nr, sbjct_token, sbjct_strand, region_from, region_to,
                 query_token, weight) = data[:9]

                entry.Read( "\t".join(data[9:]))
                
                key =  "%s_vs_%s_%s" % (query_token, sbjct_token, sbjct_strand )

                if key not in map_previous:
                    map_previous[key] = [ (entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo ) ]
                else:
                    map_previous[key].append((entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo ))

        if options.loglevel >= 1:
            print "# read %i old entries." % (len(map_previous))
            sys.stdout.flush()
            
    ## variables for file numbering
    global_nchunks = 0
    global_chunk_size = options.chunk_size
    global_outfile = None

    ## counters of pairs/regions
    npairs = 0
    nregions = 0
    nskipped = 0
    
    region_id = None
    region_nr = None
    region_max_nr = None
    last_region_id = None

    last_margin_sbjct_from, last_margin_sbjct_to = None, None

    segments = []
    map_query2segments = {}

    entry = PredictionParser.PredictionParserEntry()
        
    for line in sys.stdin:
        
        if line[0] == "#": continue
    
        try:

            if options.input_format == "minimal":
                (entry.mQueryToken, entry.mSbjctToken, entry.mSbjctStrand,
                 entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo) = line[:-1].split("\t")
                entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo = int(entry.mSbjctGenomeFrom), int(entry.mSbjctGenomeTo)
            elif options.input_format == "ensembl":
                (dummy,
                 entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo,             
                 entry.mSbjctStrand, entry.mSbjctToken, 
                 entry.mQueryToken ) = line[:-1].split("\t")
                if entry.mSbjctStrand == "1":
                    entry.mSbjctStrand = "+"
                else:
                    entry.mSbjctStrand = "-"
                entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo = int(entry.mSbjctGenomeFrom), int(entry.mSbjctGenomeTo)
            elif options.input_format == "graph":
                data = line.split("\t")
                (region_id, region_nr, region_max_nr, sbjct_token, sbjct_strand, region_from, region_to,
                 query_token, weight) = data[:9]
                
                entry.Read( "\t".join( data[9:]) )
                
                if map_previous:
                    key =  "%s_vs_%s_%s" % (query_token,
                                            sbjct_token, sbjct_strand)

                    if key in map_previous:
                        found = False
                        ## check for overlap
                        for a, b in map_previous[key]:
                            if min(b, entry.mSbjctGenomeTo) - max(entry.mSbjctGenomeFrom, a) > 0:
                                found = True
                                break
                        if found:
                            nskipped += 1
                            continue

                region_nr, region_max_nr = map(int, (region_nr, region_max_nr))
                
                if last_region_id != region_id:
                    nregions += 1
                    last_region_id = region_id
                
                if options.max_region_nr:
                    region_max_nr = min(region_max_nr, options.max_region_nr)
                    if region_nr > options.max_region_nr:
                        continue
            elif options.input_format == "exons":
                (entry.mQueryToken, entry.mSbjctToken, entry.mSbjctStrand, phase, entry.mRank,
                 peptide_from, peptide_to, entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo) = line[:-1].split("\t")
                entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo, entry.mRank = map( int, (entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo, entry.mRank))
            else:
                entry.Read( line )

            if entry.mSbjctStrand == "1":
                entry.mSbjctStrand = "+"
            if entry.mSbjctStrand == "-1":
                entry.mSbjctStrand = "-"

                
        except ValueError, IndexError:
            print "# Parsing error line: %s" % line[:-1]
            continue

        ## increase margin with minimal range
        if options.min_margin:
            min_sbjct_from = max(0, entry.mSbjctGenomeFrom - options.min_margin )
            min_sbjct_to = entry.mSbjctGenomeTo + options.min_margin
        else:
            min_sbjct_from = entry.mSbjctGenomeFrom
            min_sbjct_to = entry.mSbjctGenomeTo

        margin_sbjct_from = min_sbjct_from
        margin_sbjct_to   = min_sbjct_to
        
        ## increase margin around putative gene region
        if options.default_margin >= 0:
            margin_sbjct_from = max(0, min_sbjct_from - options.default_margin )
            margin_sbjct_to = min_sbjct_to + options.default_margin
        else:
            if entry.mQueryFrom > 0:
                margin_sbjct_from = max(0, min_sbjct_from - options.max_margin )
                
            if entry.mQueryTo < entry.mQueryLength:
                margin_sbjct_to = min_sbjct_to + options.max_margin

        segments.append( [region_id, region_nr, region_max_nr,
                          min_sbjct_from, min_sbjct_to,
                          margin_sbjct_from, margin_sbjct_to,
                          entry.mQueryToken, entry.mSbjctToken, entry.mSbjctStrand] )

        if entry.mQueryToken not in map_query2segments:
            map_query2segments[entry.mQueryToken] = []

        map_query2segments[entry.mQueryToken].append( [entry.mSbjctToken, entry.mSbjctStrand, margin_sbjct_from, margin_sbjct_to, len(segments)-1] )
Exemplo n.º 4
0
    else:
        # array with final predictions
        old_predictions = []

    if param_loglevel >= 1:
        print "# reading predictions."
        sys.stdout.flush()

    # nread counts every non-comment line read from stdin
    nread = 0
    ninput = 0
    for line in sys.stdin:

        # skip comment lines
        if line[0] == "#":
            continue

        # a fresh entry object is created for every input line
        entry = PredictionParser.PredictionParserEntry(expand=0)
        entry.Read(line)
        nread += 1

        # set prediction id: entries without an id get the running
        # number of the line they were read from
        if not entry.mPredictionId:
            entry.mPredictionId = nread

        # filter bad predictions right here in order to save memory:
        # entries below the minimum total score are dropped (and logged
        # at loglevel >= 2) before anything else is done with them
        if entry.score < param_min_total_score:
            if param_loglevel >= 2:
                print "# PRUNING: reason: score below minimum: removing: %s" % str(
                    entry)
            continue
        # next criterion: coverage of the query below the minimum
        elif entry.mQueryCoverage < param_min_coverage_query:
            if param_loglevel >= 2:
Exemplo n.º 5
0
        # write one tab-separated line per exon, for every key in exons
        for k in exons.keys():
            ee = exons[k]

            # running exon number, restarted at 1 for every key
            # NOTE(review): the name ``id`` shadows the builtin id()
            id = 0
            for e in ee:
                id += 1
                # columns: query token, exon number, peptide from/to, frame,
                # a run of constant zero columns, genome from/to
                print "\t".join(
                    map(str, (e.mQueryToken, id, e.mPeptideFrom, e.mPeptideTo,
                              e.frame, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                              e.mGenomeFrom, e.mGenomeTo)))

    elif options.output_format == "exons":

        # only input in "exons" format is supported for this output format
        if options.format == "exons":
            parser = PredictionParser.PredictionParserExons(
                contig_sizes=contig_sizes)
        else:
            # NOTE(review): raising a plain string is not a valid exception
            # on Python >= 2.6 (a TypeError is raised instead of the message)
            raise "unknown format %s." % options.format

        # all of stdin is read and parsed in one go
        results = parser.Parse(sys.stdin.readlines())
        # running exon number; here it is not reset between entries
        id = 0
        for entry in results:
            exons = Exons.Alignment2Exons(
                entry.mMapPeptide2Genome,
                entry.mQueryFrom,
                entry.mSbjctGenomeFrom,
            )

            for e in exons:
                id += 1
                print "\t".join(
Exemplo n.º 6
0
def main(argv=None):
    """script main.

    Reads predictions from stdin, discards those failing the score,
    query coverage, percent identity and length thresholds, optionally
    joins regions, optionally removes suboptimal predictions and
    predictions overlapping "taboo" regions, sorts the remainder by
    genomic location and hands each cluster of overlapping predictions
    to ProcessRegion(). A summary line is written to the log at the end.

    *argv* defaults to sys.argv. NOTE(review): in this function *argv*
    is not forwarded to E.Start(), so option parsing is left entirely
    to E.Start() -- confirm whether it should be passed on.

    Exits with status 2 if positional arguments are given and with
    status 1 if no prediction survives the input filters.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/regions2graph.py 2754 2009-09-04 16:50:22Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-b",
                      "--benchmark",
                      dest="filename_benchmark",
                      type="string",
                      help="")

    ## the value is a filename and is read below through
    ## options.filename_benchmark_synonyms; storing it under
    ## "benchmark_synonyms" meant the option was silently ignored.
    parser.add_option("-y",
                      "--benchmark-synonyms",
                      dest="filename_benchmark_synonyms",
                      type="string",
                      help="")

    parser.add_option("-p",
                      "--peptides",
                      dest="filename_peptides",
                      type="string",
                      help="")

    parser.add_option("-c",
                      "--min-coverage-query",
                      dest="min_coverage_query",
                      type="float",
                      help="")

    parser.add_option("-s",
                      "--min-score",
                      dest="min_total_score",
                      type="float",
                      help="")

    parser.add_option("-i",
                      "--min-percent-identity",
                      dest="min_percent_identity",
                      type="float",
                      help="")

    parser.add_option("-o",
                      "--max-percent-overlap",
                      dest="max_percent_overlap",
                      type="float",
                      help="")

    parser.add_option("--overlap-min-score",
                      dest="overlap_min_score",
                      type="float",
                      help="")

    parser.add_option("--overlap-min-coverage",
                      dest="overlap_min_coverage",
                      type="float",
                      help="")

    parser.add_option("--overlap-min-identity",
                      dest="overlap_min_identity",
                      type="float",
                      help="")

    parser.add_option("--overlap-max-coverage",
                      dest="overlap_max_coverage",
                      type="float",
                      help="")

    parser.add_option("-m",
                      "--max-matches",
                      dest="max_matches",
                      type="int",
                      help="")

    parser.add_option("-j",
                      "--join-regions",
                      dest="join_regions",
                      type="int",
                      help="")

    parser.add_option("--join-regions-max-regions",
                      dest="join_regions_max_regions",
                      type="int",
                      help="")

    parser.add_option("--join-regions-max-coverage",
                      dest="join_regions_max_coverage",
                      type="float",
                      help="")

    parser.add_option("--min-length", dest="min_length", type="int", help="")

    parser.add_option("--test", dest="test", type="int", help="")

    parser.add_option("--filter-queries",
                      dest="filename_filter_queries",
                      type="string",
                      help="")

    parser.add_option("--filter-regions",
                      dest="filter_regions",
                      type="string",
                      help="")

    parser.add_option("--conserve-memory",
                      dest="conserve_memory",
                      action="store_true",
                      help="")

    parser.add_option("--filter-suboptimal",
                      dest="filter_suboptimal",
                      action="store_true",
                      help="")

    parser.set_defaults(
        ## overlap allowed for matches on genomic region
        max_percent_overlap=20,
        gop=-10.0,
        gep=-2.0,
        ## thresholds for joining regions
        overlap_min_score=80,
        overlap_min_coverage=80,
        overlap_max_coverage=90,
        overlap_min_identity=50,
        ## threshold for filtering bad predictions:
        ## minimum score
        min_total_score=80,
        ## joining regions
        join_regions=0,
        ## maximum coverage of query for predictions to be joined
        ## (This is to ensure not to join duplications. A range check
        ## would be better, but runs into trouble with repeats).
        join_regions_max_coverage=90,
        ## minimum coverage of query
        min_coverage_query=10,
        ## conserve memory
        conserve_memory=0,
        ## minimum percent identity
        min_percent_identity=0,
        ## minimum length
        min_length=0,
        max_matches=0,
        filename_peptides=None,
        filename_filter_queries=None,
        ## turn on/off various filters
        filter_suboptimal=False,
        filter_regions=False,
        ## parameters for filter of suboptimal predictions
        min_relative_coverage=0.5,
        min_relative_score=0.5,
        min_relative_percent_identity=0.5,
        ## minimum difference between non-correlated conflicts to keep them both.
        conflicts_min_difference=0.1,
        ## benchmarking data
        benchmarks=None,
        benchmark_synonyms=None,
        filename_benchmark=None,
        filename_benchmark_synonyms=None,
        test=None,
        max_intron=50000)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        # single-argument print(): same output as the former print statement
        print("%s %s" % (USAGE, "no arguments required."))
        sys.exit(2)

    #####################################################################################
    # read filtering
    filter_queries = {}
    if options.filename_filter_queries:
        infile = open(options.filename_filter_queries, "r")
        try:
            for line in infile:
                if line[0] == "#":
                    continue
                # rstrip instead of [:-1]: do not eat a character if the
                # last line has no trailing newline.
                query_token = line.rstrip("\n").split("\t")[0]
                filter_queries[query_token] = True
        finally:
            infile.close()

    if options.loglevel >= 1:
        options.stdlog.write("# filtering for %i queries.\n" %
                             len(filter_queries))

    #####################################################################################
    # read benchmarking regions
    if options.filename_benchmark:
        infile = open(options.filename_benchmark, "r")
        try:
            options.benchmarks = ReadBenchmarkingRegions(infile)
        finally:
            infile.close()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# read benchmarking regions for %i tokens\n" %
                len(options.benchmarks))
            sys.stdout.flush()

        options.benchmark_synonyms = {}
        if options.filename_benchmark_synonyms:
            infile = open(options.filename_benchmark_synonyms, "r")
            try:
                for line in infile:
                    if line[0] == "#":
                        continue
                    # file layout: value <tab> key
                    value, key = line.rstrip("\n").split("\t")
                    options.benchmark_synonyms[key] = value
            finally:
                infile.close()
    else:
        options.benchmarks = {}
        options.benchmark_synonyms = {}

    #####################################################################################
    # read peptide sequences
    if options.filename_peptides:
        infile = open(options.filename_peptides, "r")
        try:
            peptide_sequences = Genomics.ReadPeptideSequences(infile)
        finally:
            infile.close()
    else:
        peptide_sequences = {}

    if options.conserve_memory:
        ## predictions are kept in a temporary file instead of a list
        old_predictions, filename_old_predictions = tempfile.mkstemp()
        os.close(old_predictions)
        old_predictions = PredictionFile.PredictionFile()
        old_predictions.open(filename_old_predictions, "w")
    else:
        ## array with final predictions
        old_predictions = []

    if options.loglevel >= 1:
        options.stdlog.write("# reading predictions.\n")
        sys.stdout.flush()

    nread = 0
    ninput = 0
    for line in sys.stdin:

        if line[0] == "#":
            continue

        entry = PredictionParser.PredictionParserEntry(expand=0)
        entry.Read(line)
        nread += 1

        # set prediction id
        if not entry.mPredictionId:
            entry.mPredictionId = nread

        ## filter bad predictions right here in order to save memory:
        if entry.score < options.min_total_score:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# PRUNING: reason: score below minimum: removing: %s\n" %
                    str(entry))
            continue
        elif entry.mQueryCoverage < options.min_coverage_query:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# PRUNING: reason: coverage below minimum: removing: %s\n"
                    % str(entry))
            continue
        elif entry.mPercentIdentity < options.min_percent_identity:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# PRUNING: reason: percent identity below minimum: removing: %s\n"
                    % str(entry))
            continue
        elif entry.mSbjctTo - entry.mSbjctFrom < options.min_length:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# PRUNING: reason: length of transcript below minimum: removing: %s\n"
                    % str(entry))
            continue

        ## --test limits the number of accepted predictions. Check before
        ## counting so that ninput equals the number of entries stored.
        if options.test and ninput >= options.test:
            break

        ninput += 1

        old_predictions.append(entry)

    if options.loglevel >= 1:
        options.stdlog.write("# predictions after input: %i\n" % ninput)
        sys.stdout.flush()

    if options.loglevel >= 10:

        options.stdlog.write(
            "############## start: predictions after input ###################################\n"
        )
        for x in old_predictions:
            options.stdlog.write("# %s\n" % str(x))
        options.stdlog.write(
            "############## end: predictions after input #####################################\n"
        )
        sys.stdout.flush()

    if ninput == 0:
        options.stdlog.write("# ERROR: no predictions\n")
        if options.conserve_memory:
            ## do not leave the temporary file behind on early exit
            old_predictions.close()
            os.remove(filename_old_predictions)
        sys.exit(1)

    #####################################################################################
    ## set up stacks of regions
    if options.conserve_memory:
        old_predictions.close()
        old_predictions.open(mode="r")
        removed_predictions, filename_removed_predictions = tempfile.mkstemp()
        os.close(removed_predictions)
        removed_predictions = PredictionFile.PredictionFile()
        removed_predictions.open(filename_removed_predictions, "w")

        new_predictions, filename_new_predictions = tempfile.mkstemp()
        os.close(new_predictions)
        new_predictions = PredictionFile.PredictionFile()
        new_predictions.open(filename_new_predictions, "w")
    else:
        removed_predictions = []
        new_predictions = []

    if options.benchmarks:
        EvaluateBenchmark(old_predictions)

    #####################################################################################
    ## join regions
    if options.join_regions and options.join_regions_max_coverage:
        if options.loglevel >= 1:
            options.stdlog.write(
                "# joining regions: maximum distance between segments = %i and maximum query coverage = %i\n"
                % (options.join_regions, options.join_regions_max_coverage))
            sys.stdout.flush()
        njoined = JoinRegions(old_predictions, new_predictions)
        if options.conserve_memory:
            ExchangeStreams(old_predictions, new_predictions)
        else:
            old_predictions = new_predictions
            new_predictions = []

        if options.loglevel >= 1:
            options.stdlog.write("# predictions after joining: %i\n" % njoined)
            sys.stdout.flush()

        if options.loglevel >= 10:
            options.stdlog.write(
                "############## start: predictions after joining ###################################\n"
            )
            for x in old_predictions:
                ## one entry per line, as in the dump after input
                options.stdlog.write("# %s\n" % str(x))
            options.stdlog.write(
                "############## end: predictions after joining #####################################\n"
            )
            sys.stdout.flush()
    else:
        if options.loglevel >= 1:
            options.stdlog.write("# joining regions: skipped\n")
            sys.stdout.flush()

        njoined = ninput

    ##################################################################################################
    ## build map of best predictions
    if options.filter_suboptimal:
        if options.loglevel >= 1:
            options.stdlog.write("# calculating best predictions\n")
            sys.stdout.flush()
        best_predictions = GetBestPredictions(old_predictions)
    else:
        best_predictions = {}

    if options.loglevel >= 1:
        options.stdlog.write("# calculated best predictions: %i\n" %
                             len(best_predictions))
        sys.stdout.flush()

    ##################################################################################################
    ## get regions to eliminate
    ## filter_regions maps "<sbjct_token>-<sbjct_strand>" to a sorted list
    ## of (genome_from, genome_to) exon intervals.
    filter_regions = {}
    if options.filter_regions:

        entry = PredictionParser.PredictionParserEntry(expand=0)

        filenames = options.filter_regions.split(",")

        for filename in filenames:
            if options.loglevel >= 1:
                options.stdlog.write("# reading regions to filter from %s.\n" %
                                     (filename))
                sys.stdout.flush()

            if filename.endswith(".gz"):
                infile = gzip.open(filename, "r")
            else:
                infile = open(filename, "r")

            try:
                for line in infile:

                    if line[0] == "#":
                        continue

                    entry.Read(line)

                    exons = Exons.Alignment2Exons(
                        Genomics.String2Alignment(entry.mAlignmentString),
                        entry.mQueryFrom, entry.mSbjctGenomeFrom)

                    key = "%s-%s" % (entry.mSbjctToken, entry.mSbjctStrand)

                    intervals = filter_regions.setdefault(key, [])
                    for exon in exons:
                        intervals.append((exon.mGenomeFrom, exon.mGenomeTo))
            finally:
                infile.close()

        for intervals in filter_regions.values():
            intervals.sort()

    ##################################################################################################
    ## bipartite graph construction

    ##################################################################################################
    ## sort predictions by genomic region
    sort_fields = ('mSbjctToken', 'mSbjctStrand',
                   'mSbjctGenomeFrom', 'mSbjctGenomeTo')
    if options.conserve_memory:
        old_predictions.sort(sort_fields)
    else:
        ## key-based sort: same ordering as comparing the field tuples
        old_predictions.sort(
            key=lambda x: (x.mSbjctToken, x.mSbjctStrand,
                           x.mSbjctGenomeFrom, x.mSbjctGenomeTo))

    ##################################################################################################
    ## filter predictions and resolve conflicts based on genomic overlap
    ## deleted segments are put in a temporary storage space.
    min_from = None
    min_to, max_to = None, None
    region_id = 0
    noverlaps = 0
    last_prediction = None
    predictions = []
    region = Region()
    nclusters = 0
    neliminated_suboptimal = 0
    neliminated_overlap = 0

    noutput, nfiltered = 0, 0

    for this_prediction in old_predictions:

        ## Filter 1: skip suboptimal predictions
        if this_prediction.mQueryToken in best_predictions:

            best_prediction = best_predictions[this_prediction.mQueryToken]

            ## count optimistically; undone below if the prediction
            ## passes all three relative thresholds.
            neliminated_suboptimal += 1
            if float(
                    this_prediction.mQueryCoverage
            ) / best_prediction.mQueryCoverage < options.min_relative_coverage:
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# PRUNING: reason: coverage below best: removing %s\n"
                        % str(this_prediction))
                continue

            if float(this_prediction.score
                     ) / best_prediction.score < options.min_relative_score:
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# PRUNING: reason: score below best: removing %s\n" %
                        str(this_prediction))
                continue

            if float(
                    this_prediction.mPercentIdentity
            ) / best_prediction.mPercentIdentity < options.min_relative_percent_identity:
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# PRUNING: reason: percent identity below best: removing %s\n"
                        % str(this_prediction))
                continue

            neliminated_suboptimal -= 1

        ## Filter 2: remove predictions overlapping with certain segments
        key = "%s-%s" % (this_prediction.mSbjctToken,
                         this_prediction.mSbjctStrand)

        if key in filter_regions:

            exons = Exons.Alignment2Exons(
                Genomics.String2Alignment(this_prediction.mAlignmentString),
                this_prediction.mQueryFrom, this_prediction.mSbjctGenomeFrom)

            if CheckOverlap([(x.mGenomeFrom, x.mGenomeTo) for x in exons],
                            filter_regions[key]):
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# PRUNING: reason: overlapping with taboo region: removing %s\n"
                        % str(this_prediction))
                neliminated_overlap += 1
                continue

        ## process first entry
        if min_from is None:
            min_from = this_prediction.mSbjctGenomeFrom
            max_to = this_prediction.mSbjctGenomeTo
            min_to = this_prediction.mSbjctGenomeTo
            predictions.append(this_prediction)
            last_prediction = this_prediction
            continue

        ## NOTE(review): overlap is tested against min_to (smallest end
        ## in the current cluster), not max_to -- confirm this is intended.
        overlap = min_to > this_prediction.mSbjctGenomeFrom and \
                  last_prediction.mSbjctToken == this_prediction.mSbjctToken and \
                  last_prediction.mSbjctStrand == this_prediction.mSbjctStrand

        if options.loglevel >= 4:
            options.stdlog.write("# from=%i, to=%i, working on: %s\n" %
                                 (min_from, max_to, str(this_prediction)))
            options.stdlog.flush()

        # resolve overlap between different genes
        if overlap:
            noverlaps += 1
        else:
            ## cluster is complete: emit it and start a new one
            region.mSbjctToken = last_prediction.mSbjctToken
            region.mSbjctStrand = last_prediction.mSbjctStrand
            region.mSbjctGenomeFrom = min_from
            region.mSbjctGenomeTo = max_to

            region_id, nxoutput, nxfiltered = ProcessRegion(
                predictions, region_id, region, peptide_sequences,
                filter_queries)

            noutput += nxoutput
            nfiltered += nxfiltered
            nclusters += 1
            predictions = []
            min_from = this_prediction.mSbjctGenomeFrom
            min_to = this_prediction.mSbjctGenomeTo
            max_to = this_prediction.mSbjctGenomeTo

        predictions.append(this_prediction)

        min_from = min(min_from, this_prediction.mSbjctGenomeFrom)
        min_to = min(min_to, this_prediction.mSbjctGenomeTo)
        max_to = max(max_to, this_prediction.mSbjctGenomeTo)

        last_prediction = this_prediction

    ## flush the last cluster
    if last_prediction:
        region.mSbjctToken = last_prediction.mSbjctToken
        region.mSbjctStrand = last_prediction.mSbjctStrand
        region.mSbjctGenomeFrom = min_from
        region.mSbjctGenomeTo = max_to

        region_id, nxoutput, nxfiltered = ProcessRegion(
            predictions, region_id, region, peptide_sequences, filter_queries)
        noutput += nxoutput
        nfiltered += nxfiltered

        nclusters += 1

    if options.conserve_memory:
        os.remove(filename_old_predictions)
        os.remove(filename_new_predictions)
        os.remove(filename_removed_predictions)

    if options.loglevel >= 1:
        options.stdlog.write( "# pairs: nread=%i, input=%i, joined=%i, clusters=%i, regions=%i, eliminated_subopt=%i, eliminated_overlap=%i, noutput=%i, nfiltered=%i\n" % \
              (nread, ninput, njoined, nclusters, region_id, neliminated_suboptimal, neliminated_overlap, noutput, nfiltered ))

    E.Stop()
Exemplo n.º 7
0
 def __init__(self):
     """Initialise the base Predictor, then set the genewise configuration."""
     Predictor.__init__(self)

     # program name and the option strings kept alongside it
     self.mExecutable = "genewise"
     self.mOptions = "-pseudo -init endbias"
     self.mOutputOptions = "-quiet -sum -gff -trans -pep -alb"

     # parser instance matching this predictor
     self.mParser = PredictionParser.PredictionParserGenewise()
Exemplo n.º 8
0
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gpipe/predictions2transcripts.py 1841 2008-05-08 12:07:13Z andreas $",
                                    usage = globals()["__doc__"] )
    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome."  )

    parser.add_option("-o", "--output-filename-summary", dest="output_filename_summary", type="string",
                      help="filename with summary information."  )

    parser.add_option( "--skip-header", dest="skip_header", action="store_true",
                       help="skip header."  )

    parser.add_option( "--start-codon-boundary", dest="start_codon_boundary", type="int",
                      help="maximum extension for start codon (make divisible by 3)."  )
    
    parser.add_option( "--stop-codon-boundary", dest="stop_codon_boundary", type="int",
                      help="maximum extension for stop codon (make divisible by 3)."  )

    parser.add_option( "--left-extension-mode", dest="left_extension_mode", type="choice",
                       choices=("first-start", "first-stop-backtrack"),
                       help="extension mode for 5' end.")

    parser.add_option( "--fill-introns", dest="fill_introns", type="int",
                      help="fill intron if divisible by three and no stop codon up to a maximum length of #."  )

    parser.add_option( "--introns-max-stops", dest="introns_max_stops", type="int",
                      help="maximum number of stop codons to tolerate within an intron."  )

    parser.add_option( "--output-format", dest="output_format", type="choice",
                       choices=("predictions", "extensions", "filled-introns"),
                      help="output format."  )
    
    parser.set_defaults(
        genome_file = "genome",
        start_codons = ("ATG"),
        stop_codons = ("TAG", "TAA", "TGA"),
        start_codon_boundary = 9999,
        stop_codon_boundary  = 9999,
        fill_introns = 0,
        introns_max_stops = 0,
        left_splice_signals = ("GT",),
        right_splice_signals = ("AG",),
        output_format="extensions",
        left_extension_mode = "first-start",
        skip_header = False,
        output_filename_summary = None,
        )

    (options, args) = E.Start( parser, add_pipe_options = True )

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    options.start_codon_boundary = int(options.start_codon_boundary / 3)
    options.stop_codon_boundary = int(options.stop_codon_boundary / 3)

    fasta = IndexedFasta.IndexedFasta( options.genome_file )
    
    p = PredictionParser.PredictionParserEntry()

    ninput, noutput = 0, 0
    nfilled = 0
    nseqs_filled = 0
    nseqs_extended = 0
    left_extensions = []
    right_extensions = []
    filled_introns = []

    if not options.skip_header:
        if options.output_format == "predictions":
            options.stdout.write( Prediction.Prediction().getHeader() + "\n" )
        elif options.output_format == "filled-introns":
            options.stdout.write("\t".join( ("prediction_id",
                                             "intron",
                                             "peptide_sequence",
                                             "genomic_sequence") ) + "\n" )

    if options.output_filename_summary:
        outfile_summary = open (options.output_filename_summary, "w" )
        outfile_summary.write( "id\ttype\tnumber\tlength\tfrom\tto\tsequence\n" )
    else:
        outfile_summary = None

    for line in options.stdin:
        
        if line[0] == "#": continue

        ninput += 1
        p.Read(line)

        lsequence = fasta.getLength( p.mSbjctToken )

        genome_from = max( 0, p.mSbjctGenomeFrom - options.start_codon_boundary)
        genome_to = min( lsequence, p.mSbjctGenomeTo + options.stop_codon_boundary)
        
        genomic_sequence = fasta.getSequence( p.mSbjctToken, p.mSbjctStrand,
                                              genome_from,
                                              genome_to ).upper()

        ########################################################################
        ########################################################################
        ########################################################################            
        ## Do extensions
        
        if options.start_codon_boundary or options.stop_codon_boundary:
            
            extension_start = p.mSbjctGenomeFrom - genome_from 
            extension_stop  = genome_to - p.mSbjctGenomeTo
            
            fragment_to = extension_start + p.mSbjctGenomeTo - p.mSbjctGenomeFrom

            lfragment = len(genomic_sequence)

            ########################################################################
            ########################################################################
            ########################################################################            
            ## find start codon
            start = extension_start
            found_start = False
            if options.left_extension_mode == "first-start":

                found_start, start = findCodonReverse( genomic_sequence,
                                                       start,
                                                       options.start_codons,
                                                       options.stop_codons )
                
            elif options.left_extension_mode == "first-stop-backtrack":

                if genomic_sequence[start:start+3] in options.start_codons:
                    found_start = True
                else:
                    found_start, start = findCodonReverse( genomic_sequence,
                                                           start,
                                                           options.stop_codons )
                    
                    if found_start:
                        E.info("prediction %s: stop found at %i (%i) backtracking ..." % ( p.mPredictionId, start, extension_start - start) )
                        
                        ## bracktrack to first start codon
                        found_start = False
                        while start < extension_start:
                            start += 3
                            if genomic_sequence[start:start+3] in options.start_codons:
                                found_start = True
                                break
                        else:
                            start = extension_start

                        if found_start:
                            E.info("start codon found at %i (%i)." % ( start, extension_start - start) )
                        else:
                            E.info("no start codon found." )
                    else:
                        E.info("prediction %s: no stop found ... backtracking to start codon." % ( p.mPredictionId ) )

                        found_start, start = findCodonReverse( genomic_sequence, start, options.start_codons )

                        E.info("prediction %s: no start codon found." % ( p.mPredictionId ) )

            if found_start:
                start += genome_from
            else:
                start = p.mSbjctGenomeFrom

            dstart = p.mSbjctGenomeFrom - start
            
            ########################################################################
            ########################################################################
            ########################################################################            
            ## find stop codon
            ## stop points to the beginning of the codon, thus the stop codon will
            ## not be part of the sequence.
            stop = fragment_to
            found_stop = 0
            while stop < lfragment and \
                      genomic_sequence[stop:stop+3] not in ("NNN", "XXX"):
                if genomic_sequence[stop:stop+3] in options.stop_codons:
                    found_stop = 1
                    break

                stop += 3

            if found_stop:
                stop += genome_from 
            else:
                stop = p.mSbjctGenomeTo

            dstop = stop - p.mSbjctGenomeTo 

            ########################################################################
            ########################################################################
            ########################################################################            
            ## build new prediction
            map_peptide2genome = []
            if dstart: map_peptide2genome.append( ("G", 0, dstart) )
            map_peptide2genome += p.mMapPeptide2Genome
            if dstop: map_peptide2genome.append( ("G", 0, dstop) )

            E.info("prediction %s: extension: found_start=%i, found_stop=%i, left=%i, right=%i" % ( p.mPredictionId, found_start, found_stop, dstart, dstop ) )

            ## save results
            p.mMapPeptide2Genome = map_peptide2genome
            p.mAlignmentString = Genomics.Alignment2String( map_peptide2genome )
            p.mSbjctGenomeFrom -= dstart
            p.mSbjctGenomeTo += dstop
            p.mSbjctFrom += dstart / 3
            p.mSbjctTo += dstart / 3 + dstop / 3            
            
            if dstart or dstop:
                if dstart: left_extensions.append( dstart )
                if dstop: right_extensions.append( dstop )
                
                nseqs_extended += 1

        ## update genomic sequence because borders might have changed.
        genomic_sequence = fasta.getSequence( p.mSbjctToken,
                                              p.mSbjctStrand,
                                              p.mSbjctGenomeFrom,
                                              p.mSbjctGenomeTo ).upper()

        if options.fill_introns:
            
            has_filled = False

            exons = Exons.Alignment2Exons( p.mMapPeptide2Genome,
                                           query_from = 0,
                                           sbjct_from = 0 )

            new_exons = []

            last_e = exons[0]

            nintron = 0

            for e in exons[1:]:

                nintron += 1
                lintron = e.mGenomeFrom - last_e.mGenomeTo
                
                if lintron > options.fill_introns or (lintron) % 3 != 0:
                    E.debug( "prediction %s: intron %i of size %i discarded." % \
                                 (p.mPredictionId,
                                  nintron, lintron ) )
                    
                    new_exons.append(last_e)
                    last_e = e
                    continue

                ## get sequence, include also residues from split codons
                ## when checking for stop codons.
                if e.mAlignment[0][0] == "S":
                    offset_left = last_e.mAlignment[-1][2]
                    offset_right = e.mAlignment[0][2]
                else:
                    offset_left, offset_right = 0, 0
                    
                sequence = genomic_sequence[last_e.mGenomeTo - offset_left:e.mGenomeFrom+offset_right]
                
                ## check for splice sites
                for signal in options.left_splice_signals:
                    if sequence[offset_left:offset_left+len(signal)] == signal:
                        left_signal = True
                        break
                else:
                    left_signal = False
                    
                for signal in options.right_splice_signals:
                    if sequence[-(len(signal)+offset_right):-offset_right] == signal:
                        right_signal = True
                        break
                else:
                    right_signal = False

                nstops, ngaps = 0, 0
                for codon in [ sequence[x:x+3] for x in range(0,len(sequence),3) ]:
                    if codon in options.stop_codons: nstops += 1
                    if "N" in codon.upper(): ngaps += 1
                        
                    E.debug( "prediction %s: intron %i of size %i (%i-%i) (%s:%s:%i:%i): stops=%i, gaps=%i, signals=%s,%s." % \
                                 (p.mPredictionId,
                                  nintron, lintron,
                                  offset_left, offset_right,
                                  p.mSbjctToken, p.mSbjctStrand,
                                  p.mSbjctGenomeFrom + last_e.mGenomeTo,
                                  p.mSbjctGenomeFrom + e.mGenomeFrom,
                                  nstops,
                                  ngaps,
                                  left_signal, right_signal ) )

                if nstops + ngaps > options.introns_max_stops:
                    new_exons.append(last_e)                                        
                    last_e = e
                    continue
                
                E.info( "prediction %s: filling intron %i of size %i: stops=%i, gaps=%i, signals=%s,%s" % \
                            (p.mPredictionId,
                             nintron, lintron,
                             nstops,
                             ngaps,
                             left_signal, right_signal))

                e.Merge( last_e )
                has_filled = True
                nfilled += 1
                last_e = e

                if options.output_format == "filled-introns":
                    options.stdout.write( "\t".join( map(str, ( p.mPredictionId,
                                                                nintron,
                                                                Genomics.TranslateDNA2Protein( sequence ),
                                                                sequence ) ) ) + "\n" )
                                                                
                
                filled_introns.append(lintron)
                p.mNIntrons -= 1
                
            new_exons.append(last_e)

            if has_filled: nseqs_filled += 1

            Exons.UpdatePeptideCoordinates( new_exons )
            
            p.mMapPeptide2Genome = Exons.Exons2Alignment( new_exons )
            p.mAlignmentString = Genomics.Alignment2String( p.mMapPeptide2Genome )

        ## build translated sequence
        p.mMapPeptide2Translation, p.mTranslation = Genomics.Alignment2PeptideAlignment( \
               p.mMapPeptide2Genome, p.mQueryFrom, 0, genomic_sequence )

        ## output info
        if options.output_format == "predictions":
            options.stdout.write( str(p) + "\n" )
        elif options.output_format == "extensions":
            if found_start: found_start = 1
            if found_stop: found_stop = 1
            options.stdout.write( "\t".join( map(str, ( p.mPredictionId,
                                                        found_start, found_stop, 
                                                        dstart, dstop,
                                                        p.mTranslation,
                                                        p.mSbjctGenomeFrom, p.mSbjctGenomeTo,
                                                        p.mAlignmentString ))) + "\n" )

        noutput += 1
        options.stdout.flush()

    E.info("stats  : %s" % "\t".join(Stats.DistributionalParameters().getHeaders() ))
    E.info("left   : %s" % str(Stats.DistributionalParameters(left_extensions)) )
    E.info("right  : %s" % str(Stats.DistributionalParameters(right_extensions)) )
    E.info("introns: %s" % str(Stats.DistributionalParameters(filled_introns)) )        
    E.info("ninput=%i, noutput=%i, nextended=%i, nfilled=%i, nexons_filled=%i" % (\
            ninput, noutput, nseqs_extended, nseqs_filled, nfilled))
        
    E.Stop()
Exemplo n.º 9
0
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Predictions are read line by line from stdin (lines starting with
    ``#`` are skipped), optionally filtered against the peptide sets given
    on the command line, mapped onto new subject coordinates (either via
    per-subject alignments or via breakpoint offsets read from
    ``--filename-map``), optionally checked by re-translating against
    ``--genome-file`` and finally written to ``options.stdout``.

    Summary counters are written to ``options.stdlog`` and, if
    ``--write-missed`` is given, transcripts and genes without output
    are written to that file.
    """

    if argv is None: argv = sys.argv

    ## NOTE(review): the parser was used without ever being created, which
    ## raises a NameError on the first add_option() call. The version string
    ## is a placeholder - restore the original $Id$ if it is known.
    parser = E.OptionParser( version = "%prog version: $Id$", usage = globals()["__doc__"] )

    parser.add_option("-m", "--filename-map", dest="filename_map", type="string",
                      help="filename with mapping information.")
    parser.add_option("-o", "--pattern-old", dest="pattern_old", type="string",
                      help="pattern for mapping new to old identifiers: extract string from old.")
    parser.add_option("-n", "--pattern-new", dest="pattern_new", type="string",
                      help="pattern for mapping new to old identifiers: put string into new.")
    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="genome_file.")
    parser.add_option("-p", "--peptides", dest="filename_peptides", type = "string",
                      help="filename with peptide sequences.")
    parser.add_option("-f", "--input-format", dest="input_format", type="choice",
                      help="format of mapping file", choices=("alignment", "offsets") )
    parser.add_option("-i", "--write-missed", dest="write_missed", type="string",
                      help="write missed identifiers to separate file.")
    parser.add_option("-a", "--filename-genes", dest="filename_genes", type="string",
                      help="filename with gene information.")
    parser.add_option("--filename-old-peptides", dest="filename_old_peptides", type="string",
                      help="filename with old peptide information.")
    parser.add_option("--no-renumber", dest="renumber", action="store_false",
                      help="do not renumber predictions.")
    parser.add_option("--contig-sizes-old", dest="contig_sizes_old", type="string",
                      help="contig sizes for old data.")
    parser.add_option("--contig-sizes-new", dest="contig_sizes_new", type="string",
                      help="contig sizes for new data.")
    parser.add_option("--skip-errors", dest="skip_errors", action="store_true",
                      help="skip entries with errors.")

    parser.set_defaults(
        filename_map = None,
        pattern_old = "(.+)",
        pattern_new = "%s",
        genome_file = None,
        filename_peptides = None,
        write_missed = None,
        filename_genes = None,
        filename_old_peptides = None,
        renumber = True,
        input_format = "alignment",
        contig_sizes_old = None,
        contig_sizes_new = None,
        skip_errors = None
        )

    (options, args) = E.Start( parser, add_pipe_options = True)

    predictor = PredictorExonerate()

    ## the different mapping criteria
    map_sbjcts = {}
    breakpoints = {}
    endpoints = {}
    offsets = {}

    ################################################################################################
    ## read map of transcripts to genes (first column: gene, second column: transcript)
    map_transcript2gene = {}
    if options.filename_genes:
        infile = open(options.filename_genes, "r")
        try:
            for line in infile:
                if line[0] == "#": continue
                gene, transcript = line[:-1].split("\t")[:2]
                map_transcript2gene[transcript] = gene
        finally:
            infile.close()

    ################################################################################################
    peptides = {}
    if options.filename_peptides:
        peptides = Genomics.ReadPeptideSequences( open(options.filename_peptides, "r"))
        options.stdlog.write( "# read %i peptide sequences.\n" % len(peptides))

    ################################################################################################
    ## read old query sequences and compare against new query sequences
    ## this can be used to build a map between old and new queries
    query_map_old2new = {}
    ## defined up-front, because ``unmapped`` is consulted when writing
    ## missed identifiers even if no old peptides were supplied.
    unmappable, unmapped = [], []
    if options.filename_old_peptides:
        old_peptides = Genomics.ReadPeptideSequences( open(options.filename_old_peptides, "r"))
        options.stdlog.write( "# read %i old peptide sequences.\n" % len(old_peptides))
        query_map_old2new, unmappable, unmapped = Genomics.MapSequences( old_peptides, peptides)
        options.stdlog.write( "# built map: unmappable=%i unmapped=%i.\n" % (len(unmappable), len(unmapped)))
        if options.loglevel >= 2:
            options.stdlog.write( "# unmappable: %s.\n" % ";".join(unmappable))
            options.stdlog.write( "# unmapped: %s.\n" % ";".join(unmapped))

    ################################################################################################
    ## read old/new contig sizes for mapping positive/negative coordinates
    contig_sizes_old = {}
    contig_sizes_new = {}
    if options.contig_sizes_old:
        contig_sizes_old = Genomics.ReadContigSizes( open(options.contig_sizes_old, "r") )
    if options.contig_sizes_new:
        contig_sizes_new = Genomics.ReadContigSizes( open(options.contig_sizes_new, "r") )

    ################################################################################################
    if options.filename_map:

        infile = open(options.filename_map)
        try:
            ## the option only permits "alignment" and "offsets"; the former
            ## comparison against "alignments" could never be true.
            if options.input_format == "alignment":
                for line in infile:
                    if line[0] == "#": continue

                    x, old_token, old_from, old_to, old_ali, new_from, new_to, new_ali = line[:-1].split("\t")

                    map_sbjcts[old_token] = (old_from, old_ali, new_from, new_ali)

                if options.loglevel >= 1:
                    options.stdlog.write( "# read %i alignments.\n" % len(map_sbjcts))

            elif options.input_format == "offsets":
                ## input is a list of segments and their offsets.

                breakpoints, endpoints, offsets = ReadOffsets( infile )
                if options.loglevel >= 1:
                    options.stdlog.write( "# read breakpoints for %i chromosomes.\n" % len(breakpoints))
        finally:
            infile.close()

    ################################################################################################
    ## end of input section
    ################################################################################################

    rx = re.compile(options.pattern_old)
    last_sbjct_token = None
    ninput = 0
    nerrors = 0
    nerrors_map = 0
    nerrors_inconsistencies = 0
    nerrors_boundaries = 0
    nerrors_translation = 0
    nerrors_inconsequential = 0
    nerrors_realigned = 0
    nmapped = 0
    nfiltered = 0
    naligned = 0
    noutput = 0
    found_transcripts = {}
    nduplicates = 0
    ## identifiers (as returned by Genomics.GetHID) of entries already written
    output = {}

    for line in sys.stdin:
        if line[0] == "#": continue

        entry = PredictionParser.PredictionParserEntry()

        entry.Read( line )

        ninput += 1
        is_positive = entry.mSbjctStrand == "+"

        is_error = False

        ## coordinates used in diagnostics and for re-prediction. They are
        ## reset for every entry so that predictions taking neither mapping
        ## path do not fail on undefined names or report stale values.
        old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo
        f, t = old_f, old_t
        new_f, new_t = old_f, old_t

        ## check if query token is mappable: using sequence map
        if query_map_old2new and entry.mQueryToken not in query_map_old2new:
            options.stdlog.write("# skipping prediction %i: obsolete query %s\n" % (entry.mPredictionId, entry.mQueryToken) )
            nfiltered += 1
            continue

        ## check if query token is mappable: using filter
        if peptides and entry.mQueryToken not in peptides:
            options.stdlog.write("# skipping prediction %i: obsolete query %s\n" % (entry.mPredictionId, entry.mQueryToken) )
            nfiltered += 1
            continue

        new_sbjct_token = options.pattern_new % rx.search(entry.mSbjctToken).groups()[0]

        ##########################################################################################################
        ## Map via alignments
        if entry.mSbjctToken in map_sbjcts:
            nmapped += 1
            ## the alignment is only rebuilt when the subject changes
            if last_sbjct_token != entry.mSbjctToken:
                old_from, old_ali, new_from, new_ali = map_sbjcts[entry.mSbjctToken]
                map_a2b = alignlib_lite.makeAlignmentVector()
                alignlib_lite.AlignmentFormatExplicit(
                    int(old_from), old_ali,
                    int(new_from), new_ali).copy( map_a2b )

            last_sbjct_token = entry.mSbjctToken

            if options.loglevel >= 3:
                ## same output as the former python 2 print statements
                sys.stdout.write( "# %s\n" % str(entry) )
                sys.stdout.write( "# %s\n" % str(map_sbjcts[entry.mSbjctToken]) )
                sys.stdout.flush()

            old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo

            ## convert to forward coordinates:
            if is_positive:
                f, t= old_f, old_t
                first_res, last_res = f + 1, t
            else:
                f, t = map_a2b.getRowTo() - old_f, map_a2b.getRowTo() - old_t
                first_res, last_res = f, t + 1

            ## map first and last residues
            mfirst_res = map_a2b.mapRowToCol( first_res )
            mlast_res = map_a2b.mapRowToCol( last_res )

            if (mfirst_res == 0 and old_f != 0) or (mlast_res == 0 and old_t != map_a2b.getRowTo() ):

                options.stderr.write("# mapping not possible for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      first_res, last_res,
                                      mfirst_res, mlast_res,
                                      f, t))

                options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.write("# %s\n" % str(entry))
                options.stderr.flush()
                nerrors_boundaries += 1
                is_error = True

                ## get extended boundaries for alignment later on
                while mfirst_res == 0 and first_res > 1:
                    first_res -= 1
                    mfirst_res = map_a2b.mapRowToCol(first_res)
                while mlast_res == 0 and last_res < map_a2b.getRowTo():
                    last_res += 1
                    mlast_res = map_a2b.mapRowToCol(last_res)

            ## convert to genomic coordinates
            ## convert negative strand coordinates
            if is_positive:
                new_f = mfirst_res - 1
                new_t = mlast_res
            else:
                new_f = mfirst_res
                new_t = mlast_res - 1

                new_f = map_a2b.getColTo() - new_f
                new_t = map_a2b.getColTo() - new_t

            ## Now map the alignment.
            try:
                MapAlignment( entry, map_a2b )

            except ValueError:
                options.stderr.write("# alignment mapping not possible for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      first_res, last_res,
                                      mfirst_res, mlast_res,
                                      new_f, new_t,
                                      entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.flush()
                nerrors_map += 1
                is_error= True

            if new_f != entry.mSbjctGenomeFrom or new_t != entry.mSbjctGenomeTo:
                options.stderr.write("# mapping inconsistency for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i <> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      first_res, last_res,
                                      mfirst_res, mlast_res,
                                      new_f, new_t,
                                      entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))

                nerrors_inconsistencies += 1
                is_error = True

        ##########################################################################################################
        ## Map via offsets
        if entry.mSbjctToken in breakpoints:

            old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo

            ## convert to forward coordinates:
            if is_positive:
                f, t= old_f, old_t
            else:
                f, t = contig_sizes_old[entry.mSbjctToken] - old_t, contig_sizes_old[entry.mSbjctToken] - old_f

            o1 = GetOffset( f,
                            breakpoints[entry.mSbjctToken],
                            endpoints[entry.mSbjctToken],
                            offsets[entry.mSbjctToken] )
            o2 = GetOffset( t,
                            breakpoints[entry.mSbjctToken],
                            endpoints[entry.mSbjctToken],
                            offsets[entry.mSbjctToken] )

            if o1 != o2:
                options.stderr.write("# break within gene %s\n" % str(entry))
                nerrors_map += 1
                is_error = True

            f += o1
            t += o2

            if not is_positive:
                f, t = contig_sizes_new[entry.mSbjctToken] - t, contig_sizes_new[entry.mSbjctToken] - f

            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo = f, t
            ## keep the window used for a possible re-prediction in sync
            new_f, new_t = f, t

            if entry.mSbjctGenomeFrom > entry.mSbjctGenomeTo:
                options.stderr.write("# mapping error: start after end %s\n" % str(entry))
                nerrors_map += 1
                is_error = True

        ##########################################################################################################
        ## do translation check, if genome is given
        if options.genome_file:
            genomic_sequence = Genomics.GetGenomicSequence( new_sbjct_token, entry.mSbjctStrand,
                                                            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo,
                                                            options.genome_file,
                                                            loglevel = 0)

            map_peptide2translation, translation = Genomics.Alignment2PeptideAlignment( \
                entry.mMapPeptide2Genome, entry.mQueryFrom, 0, genomic_sequence )

            ## translations are compared ignoring any X characters
            if re.sub("X", "", translation) != re.sub("X", "", entry.mTranslation):
                options.stderr.write("# translation error for prediction %i on %s %s:%i-%i -> %i-%i <> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                ## only report the alignment if this subject actually has one
                if entry.mSbjctToken in map_sbjcts:
                    options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.write("# old=%s\n# new=%s\n" % (entry.mTranslation, translation))
                options.stderr.write("# old=%s\n# new=%s\n" % (entry.mAlignmentString, Genomics.Alignment2String(entry.mMapPeptide2Genome)))
                nerrors_translation += 1
                is_error = True

                if peptides and entry.mQueryToken in peptides:
                    naligned += 1

                    options.stdlog.write( "# aligning: %s versus %s:%s: %i-%i\n" % ( \
                        entry.mQueryToken,
                        new_sbjct_token, entry.mSbjctStrand,
                        entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))

                    # do a quick reprediction
                    ## the option is called genome_file; there is no
                    ## genome_pattern option defined by this script.
                    genomic_sequence = Genomics.GetGenomicSequence( new_sbjct_token, entry.mSbjctStrand,
                                                                    0, 0,
                                                                    genome_file = options.genome_file,
                                                                    loglevel = 0)
                    predictor.mLogLevel = 0

                    result =  predictor(entry.mQueryToken, peptides[entry.mQueryToken],
                                        entry.mSbjctToken, genomic_sequence,
                                        "--exhaustive --subopt FALSE --score '%s' " % str(80),
                                        new_f - 10, new_t + 10)
                    prediction_id = entry.mPredictionId
                    if result:
                        entry = result[0]
                        entry.mPredictionId = prediction_id
                        nerrors_realigned += 1
            elif is_error:
                nerrors_inconsequential += 1

        ## total number of predictions with at least one error
        if is_error:
            nerrors += 1

        entry.mSbjctToken = new_sbjct_token

        ## map query tokens
        if query_map_old2new:
            query_tokens = query_map_old2new[entry.mQueryToken]
        else:
            query_tokens = (entry.mQueryToken,)

        if options.skip_errors and is_error:
            continue

        ## remember the identifier once: it is blanked below while computing
        ## the hash and must survive entries skipped as duplicates.
        original_prediction_id = entry.mPredictionId

        for query_token in query_tokens:

            entry.mQueryToken = query_token

            ## the identifier is excluded from the duplicate check
            entry.mPredictionId = 0

            hid = Genomics.GetHID( str(entry) )
            if hid in output:
                nduplicates += 1
                continue
            output[hid] = 1

            noutput += 1
            if options.renumber:
                prediction_id = noutput
            else:
                prediction_id = original_prediction_id

            entry.mPredictionId = prediction_id

            options.stdout.write( str(entry) + "\n")
            options.stdout.flush()
            found_transcripts[entry.mQueryToken] = 1

    ## write out found transcripts and genes
    nmissed_transcripts = 0
    missed_transcripts = []
    found_genes = {}
    if peptides:
        for x in peptides.keys():
            if x not in found_transcripts:
                nmissed_transcripts += 1
                missed_transcripts.append( x )
            elif x in map_transcript2gene:
                ## transcripts without gene information are not counted
                found_genes[map_transcript2gene[x]] = 1

    missed_genes = {}
    nmissed_genes = 0
    if map_transcript2gene:

        for t in missed_transcripts:
            g = map_transcript2gene.get(t)
            if g is not None and g not in found_genes:
                missed_genes[g] = 1
        nmissed_genes = len(missed_genes)

    if options.write_missed:
        outfile = open(options.write_missed, "w")
        try:
            for x in missed_transcripts:
                if x in unmapped:
                    status = "unmapped"
                else:
                    status = "mapped"
                outfile.write( "%s\t%s\t%s\n" % ("transcript", x, status ))
            for x in missed_genes:
                status = "unknown"
                outfile.write( "%s\t%s\t%s\n" % ("gene", x, status ))
        finally:
            outfile.close()

    options.stdlog.write("# input=%i, output=%i, filtered=%i, nduplicates=%i, mapped=%i, errors=%i\n" % (\
         ninput, noutput, nfiltered, nduplicates, nmapped, nerrors ))
    options.stdlog.write("# errors: inconsequental=%i, boundaries=%i, mapping=%i, inconsistencies=%i, translation=%i, realigned=%i\n" % (\
       nerrors_inconsequential, nerrors_boundaries, nerrors_map, nerrors_inconsistencies, nerrors_translation, nerrors_realigned ))
    options.stdlog.write("# peptides: input=%i, found=%i, missed=%i, found_genes=%i, missed_genes=%i\n" % (\
        len(peptides), len(found_transcripts), nmissed_transcripts, len(found_genes), nmissed_genes) )

    E.Stop()
Exemplo n.º 10
0
def BuildLines(dbhandle,
               statement,
               genome_lengths,
               prefix="",
               default_color=None):
    # Execute *statement* on *dbhandle* and process every returned row as a
    # prediction entry.
    #
    # dbhandle       -- database connection; cursors are created from it.
    # statement      -- SQL text executed to obtain the prediction rows.
    # genome_lengths -- dict mapping a subject token to a (length, offset)
    #                   tuple; missing tokens are added here as a side effect.
    # prefix, default_color -- not used in the part of the function that is
    #                   visible here.
    #
    # NOTE(review): this listing is truncated; the remainder of the per-row
    # loop and the return value (presumably ``lines``) are not shown.

    c = dbhandle.cursor()
    c.execute(statement)

    if param_loglevel >= 2:
        print "# received %i results." % c.rowcount

    # State initialised here is not read in the visible part of the function.
    sbjct_token = ""
    sbjct_strand = None
    sbjct_from = 10000000000000000
    sbjct_to = 0

    lines = []

    nmatches = 0

    for line in c.fetchall():

        entry = PredictionParser.PredictionParserEntry()

        entry.FillFromTable(line)

        # Genome length is looked up lazily: the sequence file for this
        # subject is only read the first time the subject is seen, and the
        # result is cached in the caller-supplied dict with an offset of 0.
        if not genome_lengths.has_key(entry.mSbjctToken):
            filename_genome = param_genome_file % entry.mSbjctToken
            forward_sequences, reverse_sequences = Genomics.ReadGenomicSequences(
                open(filename_genome, "r"))
            genome_lengths[entry.mSbjctToken] = (len(
                forward_sequences[entry.mSbjctToken]), 0)

        lgenome, offset = genome_lengths[entry.mSbjctToken]

        if param_loglevel >= 4:
            print "# lgenome=%i, offset=%i" % (lgenome, offset)

        # get cds information
        exons = []
        if param_tablename_exons:
            cc = dbhandle.cursor()

            # NOTE(review): the table name is interpolated into the SQL text;
            # it comes from a module-level parameter, not from row data.
            statement = """SELECT exon_from, exon_to, exon_frame, genome_exon_from, genome_exon_to
            FROM %s WHERE prediction_id = %i""" % (
                param_tablename_exons,
                entry.mPredictionId,
            )

            if param_restrict_good_exons:
                statement += " AND is_ok = TRUE"

            # A failing query is reported and treated as "no exons found".
            try:
                cc.execute(statement)
                result = cc.fetchall()
            except pgdb.DatabaseError, msg:
                print "# query failed with message", msg
                result = []

            exons = result
            cc.close()

        # Without exons from the database, fall back to exon boundaries
        # derived from the prediction's alignment or, failing that, to a
        # single pseudo-exon spanning the whole predicted genomic range.
        if not exons:
            if entry.mMapPeptide2Genome:
                exons = Genomics.Alignment2ExonBoundaries(
                    entry.mMapPeptide2Genome,
                    query_from=entry.mQueryFrom - 1,
                    sbjct_from=entry.mSbjctGenomeFrom,
                    add_stop_codon=1)
            else:
                exons = [("", "", 0, entry.mSbjctGenomeFrom,
                          entry.mSbjctGenomeTo)]

        # select gene id
        if param_tablename_genes:
            cc = dbhandle.cursor()
            statement = """SELECT gene_id
            FROM %s WHERE prediction_id = %i""" % (param_tablename_genes,
                                                   entry.mPredictionId)

            try:
                cc.execute(statement)
                result = cc.fetchone()
            except pgdb.DatabaseError, msg:
                print "# query failed with message", msg
                result = None

            # NOTE(review): ``result`` is None after a failed query (and
            # possibly when no row matches - confirm for pgdb), in which
            # case the subscript below raises a TypeError.
            gene_id = result[0]
            dbhandle.commit()
            cc.close()
Exemplo n.º 11
0
	sbjct_genome_to, 
	map_query2genome
    FROM %s AS p 
    WHERE p.sbjct_token = '%s' AND
    p.sbjct_strand = '%s' AND 
    OVERLAP( %i, %i, p.sbjct_genome_from, sbjct_genome_to) > 0 
    """

    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, param_gop, param_gep)
    map_reference2target = alignlib_lite.makeAlignmentVector()
    assignment_id = 0

    for line in cr.fetchall():

        reference = PredictionParser.PredictionParserEntry()
        reference.FillFromTable(line)

        ct = dbhandle.cursor()
        ct.execute(statement %
                   (param_tablename_predictions_target, reference.mSbjctToken,
                    reference.mSbjctStrand, reference.mSbjctGenomeFrom,
                    reference.mSbjctGenomeTo))

        reference_exons = Exons.Alignment2Exons(reference.mMapPeptide2Genome,
                                                0, reference.mSbjctFrom)

        for line2 in ct.fetchall():
            target = PredictionParser.PredictionParserEntry()
            target.FillFromTable(line2)
Exemplo n.º 12
0
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gpipe/prediction2pairs.py 2031 2008-07-15 09:19:05Z andreas $", usage = globals()["__doc__"])

    parser.add_option( "-g", "--genome-file", dest="genome_file", type="string",
                       help="filename with genomic data (indexed)." )

    parser.add_option( "-c", "--cds", dest="filename_cds", type="string",
                       help="filename with cds seguences." )

    parser.add_option( "-f", "--format", dest="format", type="choice",
                       choices=("paired_fasta", ),
                       help="output format, valid options are: paired_fasta: concatenated pairwise alignments in FASTA format" )

    parser.set_defaults( 
        genome_file = "genome",
        filename_cds = "cds.fasta",
        format = "paired_fasta",
        filename_suffix = ".fasta",
        filename_prefix = "",
        )

    (options, args) = E.Start( parser, add_psql_options = True )    

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(1)

    fasta = IndexedFasta.IndexedFasta( options.genome_file )

    ## reading CDS sequences
    if options.filename_cds:
        cds_sequences = Genomics.ReadPeptideSequences( open(options.filename_cds, "r") )
    else:
        cds_sequences = {}
    
    if options.loglevel >= 1:
        options.stdlog.write( "# read %i CDS sequences\n" % len(cds_sequences) )

    last_filename_genome = None

    p = PredictionParser.PredictionParserEntry()    
    
    ninput, noutput, nsanity, n3, nlength = 0, 0, 0, 0, 0

    for line in options.stdin:
        
        if line[0] == "#": continue
        if line[0] == '"': continue
        
        p.Read(line)

        ninput += 1

        genomic_fragment = fasta.getSequence( p.mSbjctToken, p.mSbjctStrand,
                                              p.mSbjctGenomeFrom, p.mSbjctGenomeTo )

        if len(genomic_fragment) == 0:
            raise "ERROR: empty fragment %s:%s for line" % (p.mSbjctGenomeFrom, p.mSbjctGenomeTo), line
        
        try:
            cds_fragment = cds_sequences[p.mQueryToken]
        except KeyError:
            options.stdlog.write( "# ERROR: cds not found: query %s.\n" % p.mQueryToken )
            continue

        map_query2sbjct, genomic_fragment = Genomics.Alignment2CDNA( p.mMapPeptide2Genome,
                                                                     query_from = p.mQueryFrom,
                                                                     sbjct_from = 0,
                                                                     genome = genomic_fragment )

        ## check for errors:
        if map_query2sbjct.getRowTo() != p.mQueryTo * 3:
            options.stdlog.write( "# ERROR: boundary shift in query at line %s\n# %i %i\n" % (line, map_query2sbjct.getRowTo(), p.mQueryTo * 3 ) )

        if map_query2sbjct.getColTo() > len(genomic_fragment):
            options.stdlog.write(  "# ERROR: length mismatch in line %s\n# genomic fragment (%i) shorter than last aligned residue (%i)\n" %\
            (line, len(genomic_fragment), map_query2sbjct.getColTo()) )
            options.stdlog.write(  "# cds     %s\n# genomic %s\n" % (str( cds_fragment ), genomic_fragment ))
            nlength += 1
            continue
        
        if map_query2sbjct.getRowTo() > len(cds_fragment):
            options.stdlog.write(  "# ERROR: length mismatch in line %s\n# cds fragment (%i) shorter than last aligned residue (%i)\n" %\
            (line, len(cds_fragment), map_query2sbjct.getRowTo()) )
            options.stdlog.write(  "# cds     %s\n# genomic %s\n" % (str( cds_fragment ), genomic_fragment ))
            nlength += 1
            continue

        cds_seq = alignlib_lite.makeSequence( cds_fragment )
        genomic_seq = alignlib_lite.makeSequence( genomic_fragment )
        
        f = alignlib_lite.AlignmentFormatExplicit( map_query2sbjct, cds_seq, genomic_seq )
        row_ali = f.mRowAlignment
        col_ali = f.mColAlignment
        
        row_ali, col_ali = Genomics.RemoveFrameShiftsFromAlignment(row_ali, col_ali)
        
        row_ali = Genomics.MaskStopCodons( row_ali )
        col_ali = Genomics.MaskStopCodons( col_ali )        

        if len(row_ali) != len(col_ali):
            options.stdlog.write(  "# ERROR: wrong alignment lengths.\n" )
            sys.exit(1)
            
        if len(row_ali) % 3 or len(col_ali) % 3:
            options.stdlog.write( "# ERROR: sequences are not a multiple of 3 in line: %s\n" % line )
            options.stdlog.write( "# %6i %s\n# %6i %s\n" % (len(row_ali), str(row_ali), len(col_ali), str(col_ali) ) )
            n3 += 1

        input = re.sub( "[-X]", "", p.mTranslation )
        ref = re.sub( "[-X]", "", Genomics.TranslateDNA2Protein( col_ali ) )
        if input != ref:
            if options.loglevel >= 1:
                options.stdlog.write("# sanity check failed for %s - %s\n# %6i %s\n# %6i %s\n" % (p.mPredictionId, p.mQueryToken, 
                                                                                                  len(input), input, 
                                                                                                  len(ref), ref ) )
            nsanity += 1
            continue
        
        options.stdout.write(  ">%s\n%s\n" % (p.mPredictionId, row_ali) )
        options.stdout.write(  ">%s_vs_%s_%s_%i_%i\n%s\n" % \
              (p.mQueryToken, p.mSbjctToken, p.mSbjctStrand, p.mSbjctGenomeFrom, p.mSbjctGenomeTo, col_ali) ) 
        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nsanity=%i, nlength=%i, n3=%i\n" % (ninput, noutput, nsanity, nlength, n3) )
                                  
    E.Stop()
Exemplo n.º 13
0
    # read peptide sequences
    if param_filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            open(param_filename_peptides, "r"))
    else:
        peptide_sequences = {}

    # print HEADER

    if param_loglevel >= 2:
        print SHORT_HEADER_SUMMARY

    # aligned entries from exonerate
    entries = []

    parser = PredictionParser.PredictionParserExonerate()

    if param_format == "exonerate":
        for line in sys.stdin:

            if line[0] == "#":
                continue
            if line[:3] != "diy":
                continue

            data = string.split(line[:-1], "\t")

            query_token = data[1]

            # parser has to go inside, because GetBestMatch returns reference
            # copy
Exemplo n.º 14
0
        if o in ("-v", "--verbose"):
            param_loglevel = int(a)
        elif o in ("--version", ):
            print "version="
            sys.exit(0)
        elif o in ("-h", "--help"):
            print USAGE
            sys.exit(0)
        elif o in ("-t", "--trans"):
            param_trans = 1

    print E.GetHeader()
    print E.GetParams()

    if param_trans:
        parser = PredictionParser.PredictionParserBlatTrans()
    else:
        parser = PredictionParser.PredictionParserBlatCDNA()

    nmatches = 1
    for line in sys.stdin:
        if line[0] == "#":
            continue
        if not re.match("^[0-9]", line):
            continue

        try:
            entries = parser.Parse((line, ))
        except PredictionParser.AlignmentError, e:
            print "# %s" % str(e)
            print "#", line[:-1]
Exemplo n.º 15
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/gff2predictions.py 2021 2008-07-10 16:00:48Z andreas $",
        usage=globals()["__doc__"],
    )

    parser.add_option("-t", "--trans", dest="trans", help="input is translated DNA.", action="store_true")

    parser.add_option(
        "-f", "--format", dest="format", help="input format.", type="choice", choices=("exons", "psl", "gff")
    )

    parser.add_option(
        "-o",
        "--output-format",
        dest="output_format",
        help="output format",
        type="choice",
        choices=("exontable", "exons", "predictions", "cds", "fasta"),
    )

    parser.add_option(
        "-g", "--genome-file", dest="genome_file", type="string", help="filename with genomic data (indexed)."
    )

    parser.add_option(
        "--predictions-file",
        dest="predictions_file",
        type="string",
        help="filename with predictions. Use gene structures from this file if available.",
    )

    parser.add_option(
        "-i",
        "--gff-field-id",
        dest="gff_field_id",
        type="string",
        help="field for the feature id in the gff info section.",
    )

    parser.add_option(
        "-p",
        "--filename-peptides",
        dest="filename_peptides",
        type="string",
        help="Filename with peptide sequences. If given, it is used to check the predicted translated sequences.",
    )

    parser.add_option(
        "--no-realignment",
        dest="do_realignment",
        action="store_false",
        help="do not re-align entries that do not parse correctly.",
    )

    parser.add_option(
        "--remove-unaligned",
        dest="remove_unaligned",
        action="store_true",
        help="remove entries that have not been aligned correctly.",
    )

    parser.add_option(
        "--input-coordinates",
        dest="input_coordinates",
        type="string",
        help="specify input format for input coordinates [forward|both-zero|one-closed|open].",
    )

    parser.set_defaults(
        trans=False,
        output_format="predictions",
        format="psl",
        gff_field_id="id",
        input_coordinates="both-zero-open",
        filename_peptides=None,
        genome_file=None,
        do_realignment=True,
        predictions_file=None,
        remove_unaligned=False,
    )

    (options, args) = E.Start(parser)

    if not options.genome_file:
        raise "please specify a genome file."

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contig_sizes = fasta.getContigSizes()

    ninput, noutput, nskipped = 0, 0, 0
    nfound, nnotfound, nidentical, nmismatch, naligned, nunaligned = 0, 0, 0, 0, 0, 0

    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(IOTools.openFile(options.filename_peptides, "r"))
        predictor = Predictor.PredictorExonerate()
        predictor.mLogLevel = 0
    else:
        peptide_sequences = None
        predictor = None

    converter = IndexedFasta.getConverter(options.input_coordinates)

    predictions = {}
    if options.predictions_file:
        parser = PredictionParser.iterator_predictions(IOTools.openFile(options.predictions_file, "r"))
        for p in parser:
            predictions[p.mPredictionId] = p

    if options.output_format == "predictions":

        if options.format == "psl":

            if options.trans:
                parser = PredictionParser.PredictionParserBlatTrans()
            else:
                parser = PredictionParser.PredictionParserBlatCDNA()

            nmatches = 1
            for line in sys.stdin:
                if line[0] == "#":
                    continue
                if not re.match("^[0-9]", line):
                    continue

                try:
                    entries = parser.Parse((line,))
                except PredictionParser.AlignmentError, e:
                    print "# %s" % str(e)
                    print "#", line[:-1]
                    sys.exit(1)

                for entry in entries:
                    entry.mPredictionId = nmatches
                    nmatches += 1

                print str(entries)

        elif options.format == "exons":
            parser = PredictionParser.PredictionParserExons(contig_sizes=contig_sizes)
        else:
            raise "unknown format %s for output option %s" % (options.format, options.output_format)

        if options.loglevel >= 2:
            options.stdlog.write("# parsing.\n")
            options.stdlog.flush()

        results = parser.Parse(sys.stdin.readlines())

        if options.loglevel >= 2:
            options.stdlog.write("# parsing finished.\n")
            options.stdlog.flush()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# parsing: ninput=%i, noutput=%i, nerrors=%i\n"
                % (parser.GetNumInput(), parser.GetNumOutput(), parser.GetNumErrors())
            )

            for error, msg in parser.mErrors:
                options.stdlog.write("# %s : %s\n" % (str(error), msg))
                options.stdlog.flush()

        # if genomes are given: build translation
        if options.genome_file:

            results.Sort(lambda x, y: cmp(x.mSbjctToken, y.mSbjctToken))

            new_results = PredictionParser.Predictions()

            for entry in results:

                ninput += 1

                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# processing entry %s:%s on %s:%s %i/%i.\n"
                        % (
                            entry.mPredictionId,
                            entry.mQueryToken,
                            entry.mSbjctToken,
                            entry.mSbjctStrand,
                            ninput,
                            len(results),
                        )
                    )
                    options.stdlog.flush()

                try:
                    lgenome = fasta.getLength(entry.mSbjctToken)
                    # added 3 residues - was a problem at split codons just before the stop.
                    # See for example the chicken sequence ENSGALP00000002741
                    genomic_sequence = fasta.getSequence(
                        entry.mSbjctToken,
                        entry.mSbjctStrand,
                        entry.mSbjctGenomeFrom,
                        min(entry.mSbjctGenomeTo + 3, lgenome),
                    )

                except KeyError:
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# did not find entry for %s on %s.\n" % (entry.mPredictionId, entry.mSbjctToken)
                        )
                    nskipped += 1
                    continue

                if predictions and entry.mPredictionId in predictions:
                    if options.loglevel >= 2:
                        options.stdlog.write(
                            "# substituting entry %s on %s:%s.\n"
                            % (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand)
                        )
                        options.stdlog.flush()
                    entry = predictions[entry.mPredictionId]

                exons = Exons.Alignment2Exons(entry.mMapPeptide2Genome, 0, entry.mSbjctGenomeFrom)

                entry.mMapPeptide2Translation, entry.mTranslation = Genomics.Alignment2PeptideAlignment(
                    Genomics.String2Alignment(entry.mAlignmentString), entry.mQueryFrom, 0, genomic_sequence
                )

                entry.score = entry.mMapPeptide2Translation.getColTo() - entry.mMapPeptide2Translation.getColFrom() + 1

                (
                    entry.mNIntrons,
                    entry.mNFrameShifts,
                    entry.mNGaps,
                    entry.mNSplits,
                    entry.mNStopCodons,
                    entry.mNDisruptions,
                ) = Genomics.CountGeneFeatures(0, entry.mMapPeptide2Genome, genomic_sequence)

                if peptide_sequences:

                    if str(entry.mPredictionId) in peptide_sequences:

                        reference = peptide_sequences[str(entry.mPredictionId)].upper()

                        translation = entry.mTranslation
                        nfound += 1

                        is_identical, nmismatches = checkIdentity(reference, translation, options)

                        if is_identical:
                            nidentical += 1
                        else:
                            nmismatch += 1

                            if options.do_realignment:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches..realigning in region %i:%i\n"
                                        % (entry.mPredictionId, entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo)
                                    )
                                    options.stdlog.flush()

                                    result = predictor(
                                        entry.mPredictionId,
                                        reference,
                                        entry.mSbjctToken,
                                        genomic_sequence,
                                        "--subopt FALSE --score '%s'" % str(80),
                                    )
                                    # "--exhaustive --subopt FALSE --score '%s'" % str(80) )

                                    if result:
                                        translation = result[0].mTranslation
                                        is_identical, nmismatches = checkIdentity(reference, translation, options)
                                    else:
                                        if options.loglevel >= 2:
                                            options.stdlog.write(
                                                "# %s: realignment returned empty result\n" % (entry.mPredictionId)
                                            )
                                            options.stdlog.flush()
                                        is_identical = False

                                    if is_identical:
                                        naligned += 1
                                        prediction_id = entry.mPredictionId
                                        sbjct_genome_from = entry.mSbjctGenomeFrom
                                        entry = result[0]
                                        entry.mPredictionId = prediction_id
                                        entry.mSbjctGenomeFrom += sbjct_genome_from
                                    else:
                                        nunaligned += 1
                                        if options.loglevel >= 1:
                                            options.stdlog.write(
                                                "# %s: mismatch on %s:%s:%i-%i after realignment\n# reference =%s\n# translated=%s\n# realigned =%s\n"
                                                % (
                                                    entry.mPredictionId,
                                                    entry.mSbjctToken,
                                                    entry.mSbjctStrand,
                                                    entry.mSbjctGenomeFrom,
                                                    entry.mSbjctGenomeTo,
                                                    reference,
                                                    entry.mTranslation,
                                                    translation,
                                                )
                                            )
                                            options.stdlog.flush()
                                        if options.remove_unaligned:
                                            nskipped += 1
                                            continue

                            else:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches on %s ... no realignment\n"
                                        % (entry.mPredictionId, entry.mSbjctToken)
                                    )
                                    if options.loglevel >= 3:
                                        options.stdlog.write(
                                            "# %s: mismatch before realignment\n# reference =%s\n# translated=%s\n"
                                            % (entry.mPredictionId, reference, translation)
                                        )
                                    options.stdlog.flush()

                                if options.remove_unaligned:
                                    nskipped += 1
                                    continue

                    else:
                        nnotfound += 1

                new_results.append(entry)
                noutput += 1

            results = new_results
        if results:
            options.stdout.write(str(results) + "\n")
Exemplo n.º 16
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gpipe/predictions2cds.py 1858 2008-05-13 15:07:05Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-o",
                      "--forward-coordinates",
                      dest="forward_coordinates",
                      action="store_true",
                      help="input uses forward coordinates.")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="choice",
                      choices=("default", "cds", "cdnas", "map", "gff",
                               "intron-fasta", "exons"),
                      help="output format.")

    parser.add_option("-r",
                      "--reset-to-start",
                      dest="reset_to_start",
                      action="store_true",
                      help="move genomic coordinates to begin from 0.")

    parser.add_option("--reset-query",
                      dest="reset_query",
                      action="store_true",
                      help="move peptide coordinates to begin from 0.")

    parser.set_defaults(genome_file=None,
                        forward_coordinates=False,
                        format="default",
                        reset_to_start=False,
                        reset_query=False)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    cds_id = 1

    entry = PredictionParser.PredictionParserEntry()

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    ninput, noutput, nskipped, nerrors = 0, 0, 0, 0

    for line in sys.stdin:

        if line[0] == "#":
            continue
        if line.startswith("id"):
            continue

        ninput += 1

        try:
            entry.Read(line)
        except ValueError, msg:
            options.stdlog.write("# parsing failed with msg %s in line %s" %
                                 (msg, line))
            nerrors += 1
            continue

        cds = Exons.Alignment2Exons(entry.mMapPeptide2Genome,
                                    query_from=entry.mQueryFrom,
                                    sbjct_from=entry.mSbjctGenomeFrom,
                                    add_stop_codon=0)

        for cd in cds:
            cd.mSbjctToken = entry.mSbjctToken
            cd.mSbjctStrand = entry.mSbjctStrand

        if cds[-1].mGenomeTo != entry.mSbjctGenomeTo:
            options.stdlog.write(
                "# WARNING: discrepancy in exon calculation!!!\n")
            for cd in cds:
                options.stdlog.write("# %s\n" % str(cd))
            options.stdlog.write("# %s\n" % entry)

        lsequence = fasta.getLength(entry.mSbjctToken)
        genomic_sequence = fasta.getSequence(entry.mSbjctToken,
                                             entry.mSbjctStrand,
                                             entry.mSbjctGenomeFrom,
                                             entry.mSbjctGenomeTo)

        # deal with forward coordinates: convert them to negative strand
        # coordinates
        if options.forward_coordinates and \
                entry.mSbjctStrand == "-":
            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo = lsequence - \
                entry.mSbjctGenomeTo, lsequence - entry.mSbjctGenomeFrom
            for cd in cds:
                cd.InvertGenomicCoordinates(lsequence)

        # attach sequence to cds
        for cd in cds:
            start = cd.mGenomeFrom - entry.mSbjctGenomeFrom
            end = cd.mGenomeTo - entry.mSbjctGenomeFrom
            cd.mSequence = genomic_sequence[start:end]

        # reset coordinates for query
        if options.reset_to_start:
            offset = entry.mPeptideFrom
            for cd in cds:
                cd.mPeptideFrom -= offset
                cd.mPeptideTo -= offset

        # play with coordinates
        if options.reset_to_start:
            offset = entry.mSbjctGenomeFrom
            for cd in cds:
                cd.mGenomeFrom -= offset
                cd.mGenomeTo -= offset
        else:
            offset = 0

        if options.format == "cds":
            rank = 0
            for cd in cds:
                rank += 1
                cd.mQueryToken = entry.mQueryToken
                cd.mSbjctToken = entry.mSbjctToken
                cd.mSbjctStrand = entry.mSbjctStrand
                cd.mRank = rank
                print str(cd)

        if options.format == "exons":
            rank = 0
            for cd in cds:
                rank += 1
                options.stdout.write("\t".join(
                    map(str, (entry.mPredictionId, cd.mSbjctToken,
                              cd.mSbjctStrand, rank, cd.frame, cd.mPeptideFrom,
                              cd.mPeptideTo, cd.mGenomeFrom, cd.mGenomeTo))) +
                                     "\n")

        elif options.format == "cdnas":
            print string.join(
                map(str,
                    (entry.mPredictionId, entry.mQueryToken, entry.mSbjctToken,
                     entry.mSbjctStrand, entry.mSbjctGenomeFrom - offset,
                     entry.mSbjctGenomeTo - offset, genomic_sequence)), "\t")

        elif options.format == "map":

            map_prediction2genome = alignlib_lite.makeAlignmentSet()

            for cd in cds:
                alignlib_lite.addDiagonal2Alignment(
                    map_prediction2genome, cd.mPeptideFrom + 1, cd.mPeptideTo,
                    (cd.mGenomeFrom - offset) - cd.mPeptideFrom)

            print string.join(
                map(str, (entry.mPredictionId, entry.mSbjctToken,
                          entry.mSbjctStrand,
                          alignlib_lite.AlignmentFormatEmissions(
                              map_prediction2genome))), "\t")

        elif options.format == "intron-fasta":
            rank = 0
            if len(cds) == 1:
                nskipped += 1
                continue

            last = cds[0].mGenomeTo
            for cd in cds[1:]:
                rank += 1
                key = "%s %i %s:%s:%i:%i" % (
                    entry.mPredictionId, rank, entry.mSbjctToken,
                    entry.mSbjctStrand, last, entry.mSbjctGenomeFrom)
                sequence = genomic_sequence[last - entry.mSbjctGenomeFrom:cd.
                                            mGenomeFrom -
                                            entry.mSbjctGenomeFrom]
                options.stdout.write(">%s\n%s\n" % (key, sequence))
                last = cd.mGenomeTo

        elif options.format == "gff-match":
            print "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tTarget \"%s\" %i %i; Score %i; Introns %i; Frameshifts %i; Stops %i" % \
                  (entry.mSbjctToken,
                   "gpipe", "similarity",
                   entry.mSbjctGenomeFrom,
                   entry.mSbjctGenomeTo,
                   entry.mPercentIdentity,
                   entry.mSbjctStrand,
                   ".",
                   entry.mQueryToken,
                   entry.mQueryFrom,
                   entry.mQueryTo,
                   entry.score,
                   entry.mNIntrons,
                   entry.mNFrameShifts,
                   entry.mNStopCodons)

        elif options.format == "gff-exon":
            rank = 0
            for cd in cds:
                rank += 1
                print "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tTarget \"%s\" %i %i; Score %i; Rank %i/%i; Prediction %i" % \
                      (entry.mSbjctToken,
                       "gpipe", "similarity",
                       cd.mGenomeFrom,
                       cd.mGenomeTo,
                       entry.mPercentIdentity,
                       entry.mSbjctStrand,
                       ".",
                       entry.mQueryToken,
                       cd.mPeptideFrom / 3 + 1,
                       cd.mPeptideTo / 3 + 1,
                       entry.score,
                       rank,
                       len(cds),
                       entry.mPredictionId)
        else:
            exon_from = 0
            for cd in cds:
                cd.mPeptideFrom = exon_from
                exon_from += cd.mGenomeTo - cd.mGenomeFrom
                cd.mPeptideTo = exon_from
                print string.join(
                    map(str, (cds_id, entry.mPredictionId, cd.mPeptideFrom,
                              cd.mPeptideTo, cd.frame, cd.mGenomeFrom,
                              cd.mGenomeTo, cd.mSequence)), "\t")
                cds_id += 1

        noutput += 1
Exemplo n.º 17
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gpipe/compare_predictions2exons.py 2011 2008-07-04 10:40:51Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-b",
                      "--boundaries",
                      dest="filename_boundaries",
                      type="string",
                      help="filename with exon boundaries.")

    parser.add_option("-e",
                      "--exons",
                      dest="filename_exons",
                      type="string",
                      help="filename with exons (output).")

    parser.add_option("-p",
                      "--peptides",
                      dest="filename_peptides",
                      type="string",
                      help="filename with peptide sequences.")

    parser.add_option(
        "-w",
        "--write-notfound",
        dest="write_notfound",
        action="store_true",
        help="print exons for predictions not found in reference.")

    parser.add_option("-q",
                      "--quality-pide",
                      dest="quality_threshold_pide",
                      type="int",
                      help="quality threshold (pide) for exons.")

    parser.set_defaults(
        genome_file="genome",
        filename_boundaries=None,
        filename_exons=None,
        filename_peptides=None,
        quality_threshold_pide=0,
        write_notfound=False,
        ## allowed number of nucleotides for exon boundaries to
        ## be considered equivalent.
        slipping_exon_boundary=9,
        ## stop codons to search for
        stop_codons=("TAG", "TAA", "TGA"),
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    reference_exon_boundaries = {}
    if options.filename_boundaries:
        reference_exon_boundaries = Exons.ReadExonBoundaries(open(
            options.filename_boundaries, "r"),
                                                             do_invert=1,
                                                             remove_utr=1)
        E.info("read exon boundaries for %i queries" %
               len(reference_exon_boundaries))

    if options.filename_exons:
        outfile_exons = open(options.filename_exons, "w")
        outfile_exons.write("%s\n" % "\t".join(
            ("prediction_id", "exon_id", "exon_from", "exon_to", "exon_frame",
             "reference_id", "reference_from", "reference_to",
             "reference_phase", "pidentity", "psimilarity", "nframeshifts",
             "ngaps", "nstopcodons", "is_ok", "genome_exon_from",
             "genome_exon_to")))

    else:
        outfile_exons = None

    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            open(options.filename_peptides, "r"))
        E.info("read peptide sequences for %i queries" %
               len(peptide_sequences))
    else:
        peptide_sequences = {}

    entry = PredictionParser.PredictionParserEntry()
    last_filename_genome = None

    nfound, nmissed_exons, nmissed_length = 0, 0, 0
    nempty_alignments = 0

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    options.stdout.write("%s\n" % "\t".join(
        ("prediction_id", "number", "dubious_exons", "boundaries_sum",
         "boundaries_max", "identical_exons", "inserted_exons",
         "deleted_exons", "inserted_introns", "deleted_introns",
         "truncated_Nterminus", "truncated_Cterminus", "deleted_Nexons",
         "deleted_Cexons", "inserted_Nexons", "inserted_Cexons")))

    for line in sys.stdin:

        if line[0] == "#": continue

        try:
            entry.Read(line)
        except ValueError, msg:
            print "# parsing failed with msg %s in line %s" % (msg, line[:-1])
            sys.exit(1)

        exons = Genomics.Alignment2ExonBoundaries(
            entry.mMapPeptide2Genome,
            query_from=entry.mQueryFrom,
            sbjct_from=entry.mSbjctGenomeFrom,
            add_stop_codon=0)

        if exons[-1][4] != entry.mSbjctGenomeTo:
            print "# WARNING: discrepancy in exon calculation!!!"
            for e in exons:
                print "#", str(e)
            print "#", str(entry)

        if options.loglevel >= 5:
            for e in exons:
                print "#", str(e)

        genomic_fragment = fasta.getSequence(entry.mSbjctToken,
                                             entry.mSbjctStrand,
                                             entry.mSbjctGenomeFrom,
                                             entry.mSbjctGenomeTo)

        skip = False
        if peptide_sequences.has_key(entry.mQueryToken):

            query_sequence = alignlib_lite.makeSequence(
                peptide_sequences[entry.mQueryToken])
            sbjct_sequence = alignlib_lite.makeSequence(entry.mTranslation)

            percent_similarity, percent_identity = 0, 0
            if query_sequence.getLength(
            ) < entry.mMapPeptide2Translation.getRowTo():
                print "# WARNING: query sequence %s is too short: %i %i" % (
                    entry.mQueryToken, query_sequence.getLength(),
                    entry.mMapPeptide2Translation.getRowTo())
                sys.stdout.flush()
                nmissed_length += 1
                skip = True

            elif sbjct_sequence.getLength(
            ) < entry.mMapPeptide2Translation.getColTo():
                print "# WARNING: sbjct sequence %s is too short: %i %i" % (
                    entry.mSbjctToken, sbjct_sequence.getLength(),
                    entry.mMapPeptide2Translation.getColTo())
                sys.stdout.flush()
                nmissed_length += 1
                skip = True
            else:
                alignlib_lite.rescoreAlignment(
                    entry.mMapPeptide2Translation, query_sequence,
                    sbjct_sequence,
                    alignlib_lite.makeScorer(query_sequence, sbjct_sequence))
                percent_identity = alignlib_lite.calculatePercentIdentity(
                    entry.mMapPeptide2Translation, query_sequence,
                    sbjct_sequence) * 100
                percent_similarity = alignlib_lite.calculatePercentSimilarity(
                    entry.mMapPeptide2Translation) * 100

            E.debug(
                "prediction %s: percent identity/similarity: before=%5.2f/%5.2f, realigned=%5.2f/%5.2f"
                %
                (str(entry.mPredictionId), entry.mPercentSimilarity,
                 entry.mPercentIdentity, percent_similarity, percent_identity))

        else:
            query_sequence = None
            sbjct_sequence = None

        # default values
        exons_num_exons = "na"
        exons_boundaries_sum = "na"
        exons_boundaries_max = "na"
        dubious_exons = "na"

        ndeleted_exons, ninserted_exons, ndeleted_introns, ninserted_introns, nidentical_exons = 0, 0, 0, 0, 0
        truncated_Nterminal_exon, truncated_Cterminal_exon = 0, 0
        ndeleted_Nexons, ndeleted_Cexons = 0, 0
        ninserted_Nexons, ninserted_Cexons = 0, 0

        exons_offset = exons[0][3]

        if not reference_exon_boundaries.has_key(entry.mQueryToken):
            print "# WARNING: sequence %s has no exon boundaries" % (
                entry.mQueryToken)
            sys.stdout.flush()
            nmissed_exons += 1
            skip = True

        if not skip:

            nfound += 1

            ref_exons = reference_exon_boundaries[entry.mQueryToken]

            ref_exons_offset = ref_exons[0].mGenomeFrom

            exons_num_exons = len(ref_exons) - len(exons)
            exons_boundaries_sum = 0
            exons_phase = 0
            exons_boundaries_max = 0
            dubious_exons = 0

            inserted_exons = 0
            temp_inserted_exons = 0

            if options.loglevel >= 3:
                for e in exons:
                    options.stdlog.write("# %s\n" % str(e))
                for e in ref_exons:
                    options.stdlog.write("# %s\n" % str(e))

            min_pide = entry.mPercentIdentity * options.quality_threshold_pide / 100

            in_sync = 0
            e, r = 0, 0

            while e < len(exons) and r < len(ref_exons):

                this_e, this_r = e + 1, r + 1
                percent_identity = 0
                percent_similarity = 0
                is_good_exon = 0

                if options.loglevel >= 4:
                    options.stdlog.write("# current exons: %i and %i\n" %
                                         (e, r))
                    sys.stdout.flush()

                exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exons[
                    e][0:6]
                ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = (
                    ref_exons[r].mPeptideFrom, ref_exons[r].mPeptideTo,
                    ref_exons[r].frame, ref_exons[r].mGenomeFrom,
                    ref_exons[r].mGenomeTo)

                ref_genome_from -= ref_exons_offset
                ref_genome_to -= ref_exons_offset

                ## get percent identity for exon
                exon_percent_identity = 0
                exon_percent_similarity = 0

                if query_sequence and sbjct_sequence:

                    tmp_ali = alignlib_lite.makeAlignmentVector()

                    xquery_from = exon_from / 3
                    xquery_to = exon_to / 3

                    alignlib_lite.copyAlignment(tmp_ali,
                                                entry.mMapPeptide2Translation,
                                                xquery_from, xquery_to)

                    if tmp_ali.getLength() == 0:
                        options.stdlog.write(
                            "# WARNING: empty alignment %s\n" % str(
                                (ref_from, exon_from, ref_to, exon_to,
                                 xquery_from, xquery_to)))
                        nempty_alignments += 1
                    else:
                        if options.loglevel >= 5:
                            options.stdlog.write("# %s\n" % str(
                                alignlib_lite.AlignmentFormatExplicit(
                                    tmp_ali, query_sequence, sbjct_sequence)))

                        exon_percent_identity = alignlib_lite.calculatePercentIdentity(
                            tmp_ali, query_sequence, sbjct_sequence) * 100
                        exon_percent_similarity = alignlib_lite.calculatePercentSimilarity(
                            tmp_ali) * 100

                if exon_percent_identity >= min_pide:
                    is_good_exon = 1
                else:
                    is_good_exon = 0

                if e < len(exons) - 1:
                    (next_exon_from, next_exon_to, next_exon_phase,
                     next_exon_genome_from, next_exon_genome_to,
                     next_exon_ali) = exons[e + 1][0:6]
                else:
                    (next_exon_from, next_exon_to, next_exon_phase,
                     next_exon_genome_from, next_exon_genome_to,
                     next_exon_ali) = 0, 0, 0, 0, 0, []

                if r < len(ref_exons) - 1:
                    next_ref_from, next_ref_to, next_ref_phase = (
                        ref_exons[r + 1].mPeptideFrom,
                        ref_exons[r + 1].mPeptideTo, ref_exons[r + 1].frame)
                else:
                    next_ref_from, next_ref_to, next_ref_phase = 0, 0, 0

                if options.loglevel >= 2:
                    options.stdlog.write("# %s\n" % "\t".join(
                        map(str, (entry.mQueryToken, exon_from, exon_to,
                                  exon_phase, exon_genome_from, exon_genome_to,
                                  ref_from, ref_to, ref_phase))))
                    sys.stdout.flush()

                # beware of small exons.
                # if less than options.slipping_exon_boundary: boundary is 0
                # check if end is more than options.splipping_exon_boundary apart as well.
                if exon_to - exon_from <= options.slipping_exon_boundary or \
                        ref_to - ref_from <= options.slipping_exon_boundary:
                    boundary = 0
                else:
                    boundary = options.slipping_exon_boundary

                if ref_to <= exon_from + boundary and \
                   ref_to <= exon_to - options.slipping_exon_boundary:
                    ## no overlap
                    is_good_exon = 0
                    if e == 0:
                        ndeleted_Nexons += 1
                    else:
                        ndeleted_exons += 1
                    r += 1
                    exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = 0, 0, 0, 0, 0
                    overlap = 0
                elif exon_to <= ref_from + boundary and \
                         exon_to <= ref_to - options.slipping_exon_boundary:
                    ## no overlap
                    is_good_exon = 0
                    if r == 0:
                        ninserted_Nexons += 1
                    else:
                        ninserted_exons += 1
                    e += 1
                    ref_from, ref_to, ref_phase = 0, 0, 0
                    overlap = 0
                else:
                    ## overlap
                    overlap = 1
                    dfrom = int(math.fabs(exon_from - ref_from))
                    dto = int(math.fabs(exon_to - ref_to))

                    ## get percent identity for overlapping fragment
                    if query_sequence and sbjct_sequence:
                        ## this the problem
                        tmp_ali = alignlib_lite.makeAlignmentVector()

                        xquery_from = max(ref_from / 3, exon_from / 3)
                        xquery_to = min(ref_to / 3, exon_to / 3)

                        alignlib_lite.copyAlignment(
                            tmp_ali, entry.mMapPeptide2Translation,
                            xquery_from, xquery_to)

                        if tmp_ali.getLength() == 0:
                            options.stdlog.write(
                                "# warning: empty alignment %s\n" % str(
                                    (ref_from, exon_from, ref_to, exon_to,
                                     xquery_from, xquery_to)))
                            percent_identity = 0
                            percent_similarity = 0
                        else:
                            if options.loglevel >= 5:
                                print str(
                                    alignlib_lite.AlignmentFormatExplicit(
                                        tmp_ali, query_sequence,
                                        sbjct_sequence))

                            percent_identity = alignlib_lite.calculatePercentIdentity(
                                tmp_ali, query_sequence, sbjct_sequence) * 100
                            percent_similarity = alignlib_lite.calculatePercentSimilarity(
                                tmp_ali) * 100

                    if percent_identity >= min_pide:
                        is_good_exon = 1
                    else:
                        is_good_exon = 0
                        dubious_exons += 1

                    ## adjust regions for terminal exons
                    if e == 0 and r == 0 and dfrom <= (entry.mQueryFrom -
                                                       1) * 3 and dfrom > 0:
                        if is_good_exon:
                            truncated_Nterminal_exon = dfrom
                        dfrom = 0

                    ## truncated terminal exons
                    if e == len(exons) - 1 and r == len(
                            ref_exons) - 1 and dto <= (
                                entry.mQueryLength -
                                entry.mQueryTo) * 3 and dto > 0:
                        if is_good_exon:
                            truncated_Cterminal_exon = dto
                        dto = 0

                    ## do not count deviations for terminal query exons
                    if e == 0 and dfrom <= entry.mQueryFrom * 3 and dfrom > 0:
                        dfrom = 0

                    if e == len(exons) - 1 and dto <= (
                            entry.mQueryLength -
                            entry.mQueryTo) * 3 and dto > 0:
                        dto = 0

                    ## permit difference of one codon (assumed to be stop)
                    if e == len(exons) - 1 and r == len(
                            ref_exons) - 1 and dto == 3:
                        dto = 0

                    ## deal with different boundary conditions:
                    if dfrom == 0 and dto == 0:
                        if is_good_exon: nidentical_exons += 1
                        e += 1
                        r += 1
                    ## next exon within this ref_exon
                    elif exon_to < ref_to and next_exon_to and next_exon_to <= ref_to + options.slipping_exon_boundary:
                        if is_good_exon: ninserted_introns += 1
                        e += 1
                        in_sync = 1
                        dto = 0
                    ## next ref_exon within this exon
                    elif ref_to < exon_to and next_ref_to and next_ref_to <= exon_to + options.slipping_exon_boundary:
                        if is_good_exon: ndeleted_introns += 1
                        r += 1
                        in_sync = 1
                        dto = 0
                    else:
                        e += 1
                        r += 1
                        if in_sync:
                            dfrom = 0

                    if is_good_exon:
                        exons_boundaries_sum += dfrom + dto
                        exons_boundaries_max = max(dfrom, exons_boundaries_max)
                        exons_boundaries_max = max(dto, exons_boundaries_max)

                    ###########################################################
                    ## count inserted/deleted introns and misplaced boundaries
                    ##
                    ## if exon and next_exon in ref_exon: inserted intron
                    ## if ref_exon and next_ref_exon in exon: deleted intron

                if outfile_exons:

                    if genomic_fragment and exon_genome_to:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures(
                            exon_genome_from - entry.mSbjctGenomeFrom,
                            exon_ali,
                            genomic_fragment,
                            border_stop_codon=0)
                    else:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons = 0, 0, 0, 0, 0

                    if exon_to == 0: this_e = 0
                    if ref_to == 0: this_r = 0
                    outfile_exons.write(
                        string.join(
                            map(str, (
                                entry.mPredictionId,
                                this_e,
                                exon_from,
                                exon_to,
                                exon_phase,
                                this_r,
                                ref_from,
                                ref_to,
                                ref_phase,
                                percent_identity,
                                percent_similarity,
                                nframeshifts,
                                ngaps,
                                nstopcodons,
                                is_good_exon,
                                exon_genome_from,
                                exon_genome_to,
                            )), "\t") + "\n")

            while e < len(exons):
                exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = exons[
                    e][0:5]
                e += 1
                ninserted_Cexons += 1

                if outfile_exons:
                    outfile_exons.write(
                        string.join(
                            map(str, (
                                entry.mPredictionId,
                                e,
                                exon_from,
                                exon_to,
                                exon_phase,
                                0,
                                0,
                                0,
                                0,
                                0,
                                0,
                                0,
                                0,
                                0,
                                1,
                                exon_genome_from,
                                exon_genome_to,
                            )), "\t") + "\n")

            while r < len(ref_exons):
                ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = (
                    ref_exons[r].mPeptideFrom, ref_exons[r].mPeptideTo,
                    ref_exons[r].frame, ref_exons[r].mGenomeFrom,
                    ref_exons[r].mGenomeTo)
                ndeleted_Cexons += 1
                ref_genome_from -= ref_exons_offset
                ref_genome_to -= ref_exons_offset
                r += 1
                if outfile_exons:
                    outfile_exons.write(
                        string.join(
                            map(str, (
                                entry.mPredictionId,
                                0,
                                0,
                                0,
                                0,
                                r,
                                ref_from,
                                ref_to,
                                ref_phase,
                                0,
                                0,
                                0,
                                0,
                                0,
                                0,
                                0,
                                0,
                            )), "\t") + "\n")
        else:
            if options.write_notfound:
                this_e = 0
                ## use prediction's identity/similarity for exons.
                ## This will still then flag stop-codons in later analysis
                percent_identity = entry.mPercentIdentity
                percent_similarity = entry.mPercentSimilarity

                for exon in exons:
                    this_e += 1
                    exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exon[
                        0:6]
                    if genomic_fragment:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures(
                            exon_genome_from - entry.mSbjctGenomeFrom,
                            exon_ali, genomic_fragment)

                    outfile_exons.write(
                        string.join(
                            map(str, (
                                entry.mPredictionId,
                                this_e,
                                exon_from,
                                exon_to,
                                exon_phase,
                                0,
                                0,
                                0,
                                0,
                                percent_identity,
                                percent_similarity,
                                nframeshifts,
                                ngaps,
                                nstopcodons,
                                1,
                                exon_genome_from,
                                exon_genome_to,
                            )), "\t") + "\n")

        options.stdout.write("\t".join(
            map(str, (entry.mPredictionId, exons_num_exons, dubious_exons,
                      exons_boundaries_sum, exons_boundaries_max,
                      nidentical_exons, ninserted_exons, ndeleted_exons,
                      ninserted_introns, ndeleted_introns,
                      truncated_Nterminal_exon, truncated_Cterminal_exon,
                      ndeleted_Nexons, ndeleted_Cexons, ninserted_Nexons,
                      ninserted_Cexons))) + "\n")
Exemplo n.º 18
0
    print E.GetHeader()
    print E.GetParams()

    # reading CDS sequences
    if param_filename_cds:
        cds_sequences = Genomics.ReadPeptideSequences(
            open(param_filename_cds, "r"))
    else:
        cds_sequences = {}

    if param_loglevel >= 1:
        print "# read %i CDS sequences" % len(cds_sequences)

    last_filename_genome = None

    p = PredictionParser.PredictionParserEntry()
    for line in sys.stdin:

        if line[0] == "#":
            continue
        if line[0] == '"':
            continue

        p.Read(line)

        # read genomic sequence
        if "%s" in param_genome_file:
            filename_genome = param_genome_file % p.mSbjctToken
        else:
            filename_genome = param_genome_file
Exemplo n.º 19
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gpipe/predictions2introns.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-o",
                      "--output-filename-summary",
                      dest="output_filename_summary",
                      type="string",
                      help="filename with summary information.")

    parser.add_option("--skip-header",
                      dest="skip_header",
                      action="store_true",
                      help="skip header.")

    parser.add_option(
        "--fill-introns",
        dest="fill_introns",
        type="int",
        help=
        "fill intron if divisible by three and no stop codon up to a maximum length of #."
    )

    parser.add_option(
        "--introns-max-stops",
        dest="introns_max_stops",
        type="int",
        help="maximum number of stop codons to tolerate within an intron.")

    parser.add_option("--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("predictions", "extensions", "filled-introns"),
                      help="output format.")

    parser.set_defaults(
        genome_file="genome",
        start_codons=("ATG"),
        stop_codons=("TAG", "TAA", "TGA"),
        skip_header=False,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    p = PredictionParser.PredictionParserEntry()

    ninput, noutput = 0, 0
    nfilled = 0
    nseqs_filled = 0
    nseqs_extended = 0
    left_extensions = []
    right_extensions = []
    filled_introns = []

    if not options.skip_header:
        options.stdout.write("\t".join((
            "prediction_id",
            "intron",
            "contig",
            "strand",
            "start",
            "end",
            "length",
            "nstops",
            "type",
            "prime5",
            "prime3",
        )) + "\n")

    for line in sys.stdin:

        if line[0] == "#": continue

        ninput += 1
        p.Read(line)

        lsequence = fasta.getLength(p.mSbjctToken)

        genomic_sequence = fasta.getSequence(p.mSbjctToken, p.mSbjctStrand,
                                             p.mSbjctGenomeFrom,
                                             p.mSbjctGenomeTo).upper()

        exons = Exons.Alignment2Exons(p.mMapPeptide2Genome,
                                      query_from=0,
                                      sbjct_from=0)

        new_exons = []

        last_e = exons[0]

        nintron = 0

        for e in exons[1:]:

            nintron += 1
            lintron = e.mGenomeFrom - last_e.mGenomeTo

            intron_is_l3 = lintron % 3 != 0

            if intron_is_l3:
                ## get sequence, include also residues from split codons
                ## when checking for stop codons.
                ## note that e.mAlignment can sometimes be empty. This might
                ## be an exonerate bug. In the alignment string there are two
                ## consecutive exons.
                if e.mAlignment and last_e.mAlignment and e.mAlignment[0][
                        0] == "S":
                    offset_left = last_e.mAlignment[-1][2]
                    offset_right = e.mAlignment[0][2]
                else:
                    offset_left, offset_right = 0, 0

                sequence = genomic_sequence[last_e.mGenomeTo -
                                            offset_left:e.mGenomeFrom +
                                            offset_right]

                intron_nstops = 0
                for codon in [
                        sequence[x:x + 3] for x in range(0, len(sequence), 3)
                ]:
                    if codon in options.stop_codons:
                        intron_nstops += 1
            else:
                intron_nstops = 0

            ## check for splice signals
            sequence = genomic_sequence[last_e.mGenomeTo:e.mGenomeFrom]

            intron_type, prime5, prime3 = Genomics.GetIntronType(sequence)

            if options.loglevel >= 2:
                options.stdlog.write( "\t".join(map(str, (p.mPredictionId,
                                                          nintron,
                                                          lintron,
                                                          intron_nstops,
                                                          intron_type,
                                                          genomic_sequence[last_e.mGenomeTo-6:last_e.mGenomeTo].lower() + "|" + sequence[:5] + "..." +\
                                                          sequence[-5:] + "|" + genomic_sequence[e.mGenomeFrom:e.mGenomeFrom+6].lower()) ) ) + "\n" )

            options.stdout.write("\t".join(
                map(str, (p.mPredictionId, nintron, p.mSbjctToken,
                          p.mSbjctStrand,
                          last_e.mGenomeTo + p.mSbjctGenomeFrom,
                          e.mGenomeFrom + p.mSbjctGenomeFrom, lintron,
                          intron_nstops, intron_type, prime5, prime3))) + "\n")

            last_e = e

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i.\n" % (\
            ninput, noutput))

    E.Stop()
Exemplo n.º 20
0
    ninput, noutput, nskipped = 0,0,0
    nfound, nnotfound, nidentical, nmismatch, naligned, nunaligned = 0,0,0,0,0,0
    
    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences( IOTools.openFile( options.filename_peptides, "r"))
        predictor = PredictorExonerate()
        predictor.mLogLevel = 0
    else:
        peptide_sequences = None
        predictor = None
        
    converter = IndexedFasta.getConverter( options.input_coordinates )

    predictions = {}
    if options.predictions_file:
        parser = PredictionParser.iterator_predictions( IOTools.openFile( options.predictions_file, "r") )
        for p in parser:
            predictions[p.mPredictionId] = p
        
    if options.output_format == "predictions":
        
        if options.format == "psl":

            if options.trans:
                parser = PredictionParser.PredictionParserBlatTrans()
            else:
                parser = PredictionParser.PredictionParserBlatCDNA()
                
            nmatches = 1
            for line in sys.stdin:
                if line[0] == "#": continue