Python Exons.Alignment2Exons примеры использования

Язык программирования: Python

Пространство имен/Пакет: CGAT

Класс/Тип: Exons

Метод/Функция: Alignment2Exons

Примеров на hotexamples.com: 8

Python Exons.Alignment2Exons — найдено 8 примеров. Это лучшие примеры Python-кода для CGAT.Exons.Alignment2Exons, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество подборки.

Основные методы

Показать Скрыть

ReadExonBoundaries(13)

Alignment2Exons(8)

CheckOverlap(3)

ComparisonResult(3)

SetRankToPositionFlag(2)

CheckCoverage(2)

CheckCoverageAinB(2)

ClusterByExonIdentity(2)

CompareGeneStructures(2)

Alignment2ExonBoundaries(1)

GetExonsRange(1)

MatchExons(1)

GetPeptideLengths(1)

GetGenomeLengths(1)

CountMissedBoundaries(1)

GetExonBoundariesFromTable(1)

Exons2Alignment(1)

Exon(1)

ClusterByExonOverlap(1)

CheckContainedAinB(1)

CalculateStats(1)

UpdatePeptideCoordinates(1)

Пример #1

Показать файл

Файл: regions2graph.py Проект: santayana/cgat

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/regions2graph.py 2754 2009-09-04 16:50:22Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-b",
                      "--benchmark",
                      dest="filename_benchmark",
                      type="string",
                      help="")

    parser.add_option("-y",
                      "--benchmark-synonyms",
                      dest="benchmark_synonyms",
                      type="string",
                      help="")

    parser.add_option("-p",
                      "--peptides",
                      dest="filename_peptides",
                      type="string",
                      help="")

    parser.add_option("-c",
                      "--min-coverage-query",
                      dest="min_coverage_query",
                      type="float",
                      help="")

    parser.add_option("-s",
                      "--min-score",
                      dest="min_total_score",
                      type="float",
                      help="")

    parser.add_option("-i",
                      "--min-percent-identity",
                      dest="min_percent_identity",
                      type="float",
                      help="")

    parser.add_option("-o",
                      "--max-percent-overlap",
                      dest="max_percent_overlap",
                      type="float",
                      help="")

    parser.add_option("--overlap-min-score",
                      dest="overlap_min_score",
                      type="float",
                      help="")

    parser.add_option("--overlap-min-coverage",
                      dest="overlap_min_coverage",
                      type="float",
                      help="")

    parser.add_option("--overlap-min-identity",
                      dest="overlap_min_identity",
                      type="float",
                      help="")

    parser.add_option("--overlap-max-coverage",
                      dest="overlap_max_coverage",
                      type="float",
                      help="")

    parser.add_option("-m",
                      "--max-matches",
                      dest="max_matches",
                      type="int",
                      help="")

    parser.add_option("-j",
                      "--join-regions",
                      dest="join_regions",
                      type="int",
                      help="")

    parser.add_option("--join-regions-max-regions",
                      dest="join_regions_max_regions",
                      type="int",
                      help="")

    parser.add_option("--join-regions-max-coverage",
                      dest="join_regions_max_coverage",
                      type="float",
                      help="")

    parser.add_option("--min-length", dest="min_length", type="int", help="")

    parser.add_option("--test", dest="test", type="int", help="")

    parser.add_option("--filter-queries",
                      dest="filename_filter_queries",
                      type="string",
                      help="")

    parser.add_option("--filter-regions",
                      dest="filter_regions",
                      type="string",
                      help="")

    parser.add_option("--conserve-memory",
                      dest="conserve_memory",
                      action="store_true",
                      help="")

    parser.add_option("--filter-suboptimal",
                      dest="filter_suboptimal",
                      action="store_true",
                      help="")

    parser.set_defaults(
        # overlap allowed for matches on genomic region
        max_percent_overlap=20,
        gop=-10.0,
        gep=-2.0,
        # thresholds for joining regions
        overlap_min_score=80,
        overlap_min_coverage=80,
        overlap_max_coverage=90,
        overlap_min_identity=50,
        # threshold for filtering bad predictions:
        # minimum score
        min_total_score=80,
        # joining regions
        join_regions=0,
        # maximum coverage of query for predictions to be joined
        # (This is to ensure not to join duplications. A range check
        # would be better, but runs into trouble with repeats).
        join_regions_max_coverage=90,
        # minimum coverage of query
        min_coverage_query=10,
        # conserve memory
        conserve_memory=0,
        # minimum percent identity
        min_percent_identity=0,
        # minimum length
        min_length=0,
        max_matches=0,
        filename_peptides=None,
        filename_filter_queries=None,
        # turn on/off various filters
        filter_suboptimal=False,
        filter_regions=False,
        # parameters for filter of suboptimal predictions
        min_relative_coverage=0.5,
        min_relative_score=0.5,
        min_relative_percent_identity=0.5,
        # minimum difference between non-correlated conflicts to keep them
        # both.
        conflicts_min_difference=0.1,
        # benchmarking data
        benchmarks=None,
        benchmark_synonyms=None,
        filename_benchmark=None,
        filename_benchmark_synonyms=None,
        test=None,
        max_intron=50000)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    ##########################################################################
    # read filtering
    filter_queries = {}
    if options.filename_filter_queries:
        for line in open(options.filename_filter_queries, "r"):
            if line[0] == "#":
                continue
            query_token = line[:-1].split("\t")[0]
            filter_queries[query_token] = True

    if options.loglevel >= 1:
        options.stdlog.write("# filtering for %i queries.\n" %
                             len(filter_queries))

    ##########################################################################
    # read benchmarking regions
    if options.filename_benchmark:
        options.benchmarks = ReadBenchmarkingRegions(
            open(options.filename_benchmark, "r"))
        if options.loglevel >= 1:
            options.stdlog.write(
                "# read benchmarking regions for %i tokens\n" %
                len(options.benchmarks))
            sys.stdout.flush()
        if options.filename_benchmark_synonyms:
            infile = open(options.filename_benchmark_synonyms, "r")
            options.benchmark_synonyms = {}
            for line in infile:
                if line[0] == "#":
                    continue
                value, key = line[:-1].split("\t")
                options.benchmark_synonyms[key] = value
        else:
            options.benchmark_synonyms = {}
    else:
        options.benchmarks = {}
        options.benchmark_synonyms = {}

    ##########################################################################
    # read peptide sequences
    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            open(options.filename_peptides, "r"))
    else:
        peptide_sequences = {}

    if options.conserve_memory:
        old_predictions, filename_old_predictions = tempfile.mkstemp()
        os.close(old_predictions)
        old_predictions = PredictionFile.PredictionFile()
        old_predictions.open(filename_old_predictions, "w")
    else:
        # array with final predictions
        old_predictions = []

    if options.loglevel >= 1:
        options.stdlog.write("# reading predictions.\n")
        sys.stdout.flush()

    nread = 0
    ninput = 0
    for line in sys.stdin:

        if line[0] == "#":
            continue

        entry = PredictionParser.PredictionParserEntry(expand=0)
        entry.Read(line)
        nread += 1

        # set prediction id
        if not entry.mPredictionId:
            entry.mPredictionId = nread

        # filter bad predictions right here in order to save memory:
        if entry.score < options.min_total_score:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# PRUNING: reason: score below minimum: removing: %s\n" %
                    str(entry))
            continue
        elif entry.mQueryCoverage < options.min_coverage_query:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# PRUNING: reason: coverage below minimum: removing: %s\n"
                    % str(entry))
            continue
        elif entry.mPercentIdentity < options.min_percent_identity:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# PRUNING: reason: percent identity below minimum: removing: %s\n"
                    % str(entry))
            continue
        elif entry.mSbjctTo - entry.mSbjctFrom < options.min_length:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# PRUNING: reason: length of transcript below minimum: removing: %s\n"
                    % str(entry))
            continue

        ninput += 1

        if options.test and ninput > options.test:
            break

        old_predictions.append(entry)

    if options.loglevel >= 1:
        options.stdlog.write("# predictions after input: %i\n" % ninput)
        sys.stdout.flush()

    if options.loglevel >= 10:

        options.stdlog.write(
            "############## start: predictions after input ###################################\n"
        )
        for x in old_predictions:
            options.stdlog.write("# %s\n" % str(x))
        options.stdlog.write(
            "############## end: predictions after input #####################################\n"
        )
        sys.stdout.flush()

    if ninput == 0:
        options.stdlog.write("# ERROR: no predictions\n")
        sys.exit(1)

    ##########################################################################
    # set up stacks of regions
    if options.conserve_memory:
        old_predictions.close()
        old_predictions.open(mode="r")
        removed_predictions, filename_removed_predictions = tempfile.mkstemp()
        os.close(removed_predictions)
        removed_predictions = PredictionFile.PredictionFile()
        removed_predictions.open(filename_removed_predictions, "w")

        new_predictions, filename_new_predictions = tempfile.mkstemp()
        os.close(new_predictions)
        new_predictions = PredictionFile.PredictionFile()
        new_predictions.open(filename_new_predictions, "w")
    else:
        removed_predictions = []
        new_predictions = []

    if options.benchmarks:
        EvaluateBenchmark(old_predictions)

    ##########################################################################
    # join regions
    if options.join_regions and options.join_regions_max_coverage:
        if options.loglevel >= 1:
            options.stdlog.write(
                "# joining regions: maximum distance between segments = %i and maximum query coverage = %i\n"
                % (options.join_regions, options.join_regions_max_coverage))
            sys.stdout.flush()
        njoined = JoinRegions(old_predictions, new_predictions)
        if options.conserve_memory:
            ExchangeStreams(old_predictions, new_predictions)
        else:
            old_predictions = new_predictions
            new_predictions = []

        if options.loglevel >= 1:
            options.stdlog.write("# predictions after joining: %i\n" % njoined)
            sys.stdout.flush()

        if options.loglevel >= 10:
            options.stdlog.write(
                "############## start: predictions after joining ###################################\n"
            )
            for x in old_predictions:
                options.stdlog.write("# %s" % str(x))
            options.stdlog.write(
                "############## end: predictions after joining #####################################\n"
            )
            sys.stdout.flush()
    else:
        if options.loglevel >= 1:
            options.stdlog.write("# joining regions: skipped\n")
            sys.stdout.flush()

        njoined = ninput

    ##########################################################################
    # build map of best predictions
    if options.filter_suboptimal:
        if options.loglevel >= 1:
            options.stdlog.write("# calculating best predictions\n")
            sys.stdout.flush()
        best_predictions = GetBestPredictions(old_predictions)
    else:
        best_predictions = {}

    if options.loglevel >= 1:
        options.stdlog.write("# calculated best predictions: %i\n" %
                             len(best_predictions))
        sys.stdout.flush()

    ##########################################################################
    # get regions to eliminate
    filter_regions = {}
    if options.filter_regions:

        entry = PredictionParser.PredictionParserEntry(expand=0)

        filenames = options.filter_regions.split(",")

        for filename in filenames:
            if options.loglevel >= 1:
                options.stdlog.write("# reading regions to filter from %s.\n" %
                                     (filename))
                sys.stdout.flush()

            if filename.endswith(".gz"):
                infile = gzip.open(filename, "r")
            else:
                infile = open(filename, "r")

            for line in infile:

                if line[0] == "#":
                    continue

                entry.Read(line)

                exons = Exons.Alignment2Exons(
                    Genomics.String2Alignment(entry.mAlignmentString),
                    entry.mQueryFrom, entry.mSbjctGenomeFrom)

                key = "%s-%s" % (entry.mSbjctToken, entry.mSbjctStrand)

                if key not in filter_regions:
                    filter_regions[key] = []

                for exon in exons:
                    filter_regions[key].append(
                        (exon.mGenomeFrom, exon.mGenomeTo))

            infile.close()

        for k in filter_regions.keys():
            filter_regions[k].sort()

    ##########################################################################
    # bipartite graph construction

    ##########################################################################
    # sort predictions by genomic region
    if options.conserve_memory:
        old_predictions.sort(('mSbjctToken', 'mSbjctStrand',
                              'mSbjctGenomeFrom', 'mSbjctGenomeTo'))
    else:
        old_predictions.sort(lambda x, y: cmp(
            (x.mSbjctToken, x.mSbjctStrand, x.mSbjctGenomeFrom, x.
             mSbjctGenomeTo), (y.mSbjctToken, y.mSbjctStrand, y.
                               mSbjctGenomeFrom, y.mSbjctGenomeTo)))

    ##########################################################################
    # filter predictions and resolve conflicts based on genomic overlap
    # deleted segments are put in a temporary storage space.
    min_from, max_from = None, None
    min_to, max_to = None, None
    region_id = 0
    noverlaps = 0
    last_prediction = None
    predictions = []
    region = Region()
    nclusters = 0
    neliminated_suboptimal = 0
    neliminated_overlap = 0

    noutput, nfiltered = 0, 0

    for this_prediction in old_predictions:

        # Filter 1: skip suboptimal predictions
        if this_prediction.mQueryToken in best_predictions:

            best_prediction = best_predictions[this_prediction.mQueryToken]

            neliminated_suboptimal += 1
            if float(
                    this_prediction.mQueryCoverage
            ) / best_prediction.mQueryCoverage < options.min_relative_coverage:
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# PRUNING: reason: coverage below best: removing %s\n"
                        % str(this_prediction))
                continue

            if float(this_prediction.score
                     ) / best_prediction.score < options.min_relative_score:
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# PRUNING: reason: score below best: removing %s\n" %
                        str(this_prediction))
                continue

            if float(
                    this_prediction.mPercentIdentity
            ) / best_prediction.mPercentIdentity < options.min_relative_percent_identity:
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# PRUNING: reason: percent identity below best: removing %s\n"
                        % str(this_prediction))
                continue

            neliminated_suboptimal -= 1

        # Filter 2: remove predictions overlapping with certain segments
        key = "%s-%s" % (this_prediction.mSbjctToken,
                         this_prediction.mSbjctStrand)

        if key in filter_regions:

            exons = Exons.Alignment2Exons(
                Genomics.String2Alignment(this_prediction.mAlignmentString),
                this_prediction.mQueryFrom, this_prediction.mSbjctGenomeFrom)

            if CheckOverlap(map(lambda x: (x.mGenomeFrom, x.mGenomeTo), exons),
                            filter_regions[key]):
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# PRUNING: reason: overlapping with taboo region: removing %s\n"
                        % str(this_prediction))
                neliminated_overlap += 1
                continue

        try:
            this_query_peptide, this_query_status, this_query_gene, this_query_transcript = \
                re.split("\s+", this_prediction.mQueryToken)
        except ValueError:
            this_query_gene = None

        # process first entry
        if min_from is None:
            min_from = this_prediction.mSbjctGenomeFrom
            max_from = this_prediction.mSbjctGenomeFrom
            max_to = this_prediction.mSbjctGenomeTo
            min_to = this_prediction.mSbjctGenomeTo
            predictions.append(this_prediction)
            last_prediction = this_prediction
            continue

        overlap = min_to > this_prediction.mSbjctGenomeFrom and \
            last_prediction.mSbjctToken == this_prediction.mSbjctToken and \
            last_prediction.mSbjctStrand == this_prediction.mSbjctStrand

        if options.loglevel >= 4:
            options.stdlog.write("# from=%i, to=%i, working on: %s\n" %
                                 (min_from, max_to, str(this_prediction)))
            options.stdlog.flush()

        # resolve overlap between different genes
        if overlap:
            noverlaps += 1
        else:
            region.mSbjctToken = last_prediction.mSbjctToken
            region.mSbjctStrand = last_prediction.mSbjctStrand
            region.mSbjctGenomeFrom = min_from
            region.mSbjctGenomeTo = max_to

            region_id, nxoutput, nxfiltered = ProcessRegion(
                predictions, region_id, region, peptide_sequences,
                filter_queries)

            noutput += nxoutput
            nfiltered += nxfiltered
            nclusters += 1
            predictions = []
            min_from = this_prediction.mSbjctGenomeFrom
            max_from = this_prediction.mSbjctGenomeFrom
            min_to = this_prediction.mSbjctGenomeTo
            max_to = this_prediction.mSbjctGenomeTo

        predictions.append(this_prediction)

        min_from = min(min_from, this_prediction.mSbjctGenomeFrom)
        max_from = max(max_from, this_prediction.mSbjctGenomeFrom)
        min_to = min(min_to, this_prediction.mSbjctGenomeTo)
        max_to = max(max_to, this_prediction.mSbjctGenomeTo)

        last_prediction = this_prediction

    if last_prediction:
        region.mSbjctToken = last_prediction.mSbjctToken
        region.mSbjctStrand = last_prediction.mSbjctStrand
        region.mSbjctGenomeFrom = min_from
        region.mSbjctGenomeTo = max_to

        region_id, nxoutput, nxfiltered = ProcessRegion(
            predictions, region_id, region, peptide_sequences, filter_queries)
        noutput += nxoutput
        nfiltered += nxfiltered

        nclusters += 1

    if options.conserve_memory:
        os.remove(filename_old_predictions)
        os.remove(filename_new_predictions)
        os.remove(filename_removed_predictions)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# pairs: nread=%i, input=%i, joined=%i, clusters=%i, regions=%i, eliminated_subopt=%i, eliminated_overlap=%i, noutput=%i, nfiltered=%i\n"
            %
            (nread, ninput, njoined, nclusters, region_id,
             neliminated_suboptimal, neliminated_overlap, noutput, nfiltered))

    E.Stop()

Пример #2

Показать файл

Файл: gff2predictions.py Проект: santayana/cgat

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gpipe/gff2predictions.py 2021 2008-07-10 16:00:48Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--trans",
                      dest="trans",
                      help="input is translated DNA.",
                      action="store_true")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      help="input format.",
                      type="choice",
                      choices=("exons", "psl", "gff"))

    parser.add_option("-o",
                      "--output-format",
                      dest="output_format",
                      help="output format",
                      type="choice",
                      choices=('exontable', 'exons', 'predictions', 'cds',
                               'fasta'))

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genomic data (indexed).")

    parser.add_option(
        "--predictions-file",
        dest="predictions_file",
        type="string",
        help=
        "filename with predictions. Use gene structures from this file if available."
    )

    parser.add_option("-i",
                      "--gff-field-id",
                      dest="gff_field_id",
                      type="string",
                      help="field for the feature id in the gff info section.")

    parser.add_option(
        "-p",
        "--filename-peptides",
        dest="filename_peptides",
        type="string",
        help=
        "Filename with peptide sequences. If given, it is used to check the predicted translated sequences."
    )

    parser.add_option(
        "--no-realignment",
        dest="do_realignment",
        action="store_false",
        help="do not re-align entries that do not parse correctly.")

    parser.add_option(
        "--remove-unaligned",
        dest="remove_unaligned",
        action="store_true",
        help="remove entries that have not been aligned correctly.")

    parser.add_option(
        "--input-coordinates",
        dest="input_coordinates",
        type="string",
        help=
        "specify input format for input coordinates [forward|both-zero|one-closed|open]."
    )

    parser.set_defaults(trans=False,
                        output_format="predictions",
                        format="psl",
                        gff_field_id='id',
                        input_coordinates="both-zero-open",
                        filename_peptides=None,
                        genome_file=None,
                        do_realignment=True,
                        predictions_file=None,
                        remove_unaligned=False)

    (options, args) = E.Start(parser)

    if not options.genome_file:
        raise "please specify a genome file."

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contig_sizes = fasta.getContigSizes()

    ninput, noutput, nskipped = 0, 0, 0
    nfound, nnotfound, nidentical, nmismatch, naligned, nunaligned = 0, 0, 0, 0, 0, 0

    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            IOTools.openFile(options.filename_peptides, "r"))
        predictor = Predictor.PredictorExonerate()
        predictor.mLogLevel = 0
    else:
        peptide_sequences = None
        predictor = None

    converter = IndexedFasta.getConverter(options.input_coordinates)

    predictions = {}
    if options.predictions_file:
        parser = PredictionParser.iterator_predictions(
            IOTools.openFile(options.predictions_file, "r"))
        for p in parser:
            predictions[p.mPredictionId] = p

    if options.output_format == "predictions":

        if options.format == "psl":

            if options.trans:
                parser = PredictionParser.PredictionParserBlatTrans()
            else:
                parser = PredictionParser.PredictionParserBlatCDNA()

            nmatches = 1
            for line in sys.stdin:
                if line[0] == "#":
                    continue
                if not re.match("^[0-9]", line):
                    continue

                try:
                    entries = parser.Parse((line, ))
                except PredictionParser.AlignmentError, e:
                    print "# %s" % str(e)
                    print "#", line[:-1]
                    sys.exit(1)

                for entry in entries:
                    entry.mPredictionId = nmatches
                    nmatches += 1

                print str(entries)

        elif options.format == "exons":
            parser = PredictionParser.PredictionParserExons(
                contig_sizes=contig_sizes)
        else:
            raise "unknown format %s for output option %s" % (
                options.format, options.output_format)

        if options.loglevel >= 2:
            options.stdlog.write("# parsing.\n")
            options.stdlog.flush()

        results = parser.Parse(sys.stdin.readlines())

        if options.loglevel >= 2:
            options.stdlog.write("# parsing finished.\n")
            options.stdlog.flush()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# parsing: ninput=%i, noutput=%i, nerrors=%i\n" %
                (parser.GetNumInput(), parser.GetNumOutput(),
                 parser.GetNumErrors()))

            for error, msg in parser.mErrors:
                options.stdlog.write("# %s : %s\n" % (str(error), msg))
                options.stdlog.flush()

        # if genomes are given: build translation
        if options.genome_file:

            results.Sort(lambda x, y: cmp(x.mSbjctToken, y.mSbjctToken))

            new_results = PredictionParser.Predictions()

            for entry in results:

                ninput += 1

                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# processing entry %s:%s on %s:%s %i/%i.\n" %
                        (entry.mPredictionId, entry.mQueryToken,
                         entry.mSbjctToken, entry.mSbjctStrand, ninput,
                         len(results)))
                    options.stdlog.flush()

                try:
                    lgenome = fasta.getLength(entry.mSbjctToken)
                    # added 3 residues - was a problem at split codons just before the stop.
                    # See for example the chicken sequence ENSGALP00000002741
                    genomic_sequence = fasta.getSequence(
                        entry.mSbjctToken, entry.mSbjctStrand,
                        entry.mSbjctGenomeFrom,
                        min(entry.mSbjctGenomeTo + 3, lgenome))

                except KeyError:
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# did not find entry for %s on %s.\n" %
                            (entry.mPredictionId, entry.mSbjctToken))
                    nskipped += 1
                    continue

                if predictions and entry.mPredictionId in predictions:
                    if options.loglevel >= 2:
                        options.stdlog.write(
                            "# substituting entry %s on %s:%s.\n" %
                            (entry.mPredictionId, entry.mSbjctToken,
                             entry.mSbjctStrand))
                        options.stdlog.flush()
                    entry = predictions[entry.mPredictionId]

                exons = Exons.Alignment2Exons(entry.mMapPeptide2Genome, 0,
                                              entry.mSbjctGenomeFrom)

                entry.mMapPeptide2Translation, entry.mTranslation = Genomics.Alignment2PeptideAlignment(
                    Genomics.String2Alignment(entry.mAlignmentString),
                    entry.mQueryFrom, 0, genomic_sequence)

                entry.score = entry.mMapPeptide2Translation.getColTo(
                ) - entry.mMapPeptide2Translation.getColFrom() + 1

                (entry.mNIntrons, entry.mNFrameShifts, entry.mNGaps, entry.mNSplits, entry.mNStopCodons, entry.mNDisruptions ) = \
                    Genomics.CountGeneFeatures(0,
                                               entry.mMapPeptide2Genome,
                                               genomic_sequence)

                if peptide_sequences:

                    if str(entry.mPredictionId) in peptide_sequences:

                        reference = peptide_sequences[str(
                            entry.mPredictionId)].upper()

                        translation = entry.mTranslation
                        nfound += 1

                        is_identical, nmismatches = checkIdentity(
                            reference, translation, options)

                        if is_identical:
                            nidentical += 1
                        else:
                            nmismatch += 1

                            if options.do_realignment:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches..realigning in region %i:%i\n"
                                        % (entry.mPredictionId,
                                           entry.mSbjctGenomeFrom,
                                           entry.mSbjctGenomeTo))
                                    options.stdlog.flush()

                                    result = predictor(
                                        entry.mPredictionId, reference,
                                        entry.mSbjctToken, genomic_sequence,
                                        "--subopt FALSE --score '%s'" %
                                        str(80))
                                    # "--exhaustive --subopt FALSE --score '%s'" % str(80) )

                                    if result:
                                        translation = result[0].mTranslation
                                        is_identical, nmismatches = checkIdentity(
                                            reference, translation, options)
                                    else:
                                        if options.loglevel >= 2:
                                            options.stdlog.write(
                                                "# %s: realignment returned empty result\n"
                                                % (entry.mPredictionId))
                                            options.stdlog.flush()
                                        is_identical = False

                                    if is_identical:
                                        naligned += 1
                                        prediction_id = entry.mPredictionId
                                        sbjct_genome_from = entry.mSbjctGenomeFrom
                                        entry = result[0]
                                        entry.mPredictionId = prediction_id
                                        entry.mSbjctGenomeFrom += sbjct_genome_from
                                    else:
                                        nunaligned += 1
                                        if options.loglevel >= 1:
                                            options.stdlog.write(
                                                "# %s: mismatch on %s:%s:%i-%i after realignment\n# reference =%s\n# translated=%s\n# realigned =%s\n"
                                                %
                                                (entry.mPredictionId,
                                                 entry.mSbjctToken,
                                                 entry.mSbjctStrand,
                                                 entry.mSbjctGenomeFrom,
                                                 entry.mSbjctGenomeTo,
                                                 reference, entry.mTranslation,
                                                 translation))
                                            options.stdlog.flush()
                                        if options.remove_unaligned:
                                            nskipped += 1
                                            continue

                            else:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches on %s ... no realignment\n"
                                        % (
                                            entry.mPredictionId,
                                            entry.mSbjctToken,
                                        ))
                                    if options.loglevel >= 3:
                                        options.stdlog.write(
                                            "# %s: mismatch before realignment\n# reference =%s\n# translated=%s\n"
                                            % (entry.mPredictionId, reference,
                                               translation))
                                    options.stdlog.flush()

                                if options.remove_unaligned:
                                    nskipped += 1
                                    continue

                    else:
                        nnotfound += 1

                new_results.append(entry)
                noutput += 1

            results = new_results
        if results:
            options.stdout.write(str(results) + "\n")

Пример #3

Показать файл

def main(argv=None):
    """Report the introns of every prediction read from stdin.

    Parses command line options in sys.argv, unless *argv* is given.

    Every input line that does not start with ``#`` is parsed as a
    prediction.  Its peptide-to-genome alignment is converted into exons
    and one tab-separated row is written to ``options.stdout`` for each
    gap between two consecutive exons: prediction id, intron number,
    contig, strand, start, end, length, number of stop codons, and the
    three values returned by ``Genomics.GetIntronType``.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gpipe/predictions2introns.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-o",
                      "--output-filename-summary",
                      dest="output_filename_summary",
                      type="string",
                      help="filename with summary information.")

    parser.add_option("--skip-header",
                      dest="skip_header",
                      action="store_true",
                      help="skip header.")

    parser.add_option(
        "--fill-introns",
        dest="fill_introns",
        type="int",
        help=
        "fill intron if divisible by three and no stop codon up to a maximum length of #."
    )

    parser.add_option(
        "--introns-max-stops",
        dest="introns_max_stops",
        type="int",
        help="maximum number of stop codons to tolerate within an intron.")

    parser.add_option("--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("predictions", "extensions", "filled-introns"),
                      help="output format.")

    parser.set_defaults(
        genome_file="genome",
        # 1-tuple: ("ATG") without the trailing comma is just the string
        # "ATG", which turns membership tests into substring tests.
        start_codons=("ATG",),
        stop_codons=("TAG", "TAA", "TGA"),
        skip_header=False,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        sys.stdout.write("%s no arguments required.\n" % (USAGE,))
        sys.exit(2)

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    # a single parser entry is re-used for all input lines
    p = PredictionParser.PredictionParserEntry()

    ninput, noutput = 0, 0

    if not options.skip_header:
        options.stdout.write("\t".join((
            "prediction_id",
            "intron",
            "contig",
            "strand",
            "start",
            "end",
            "length",
            "nstops",
            "type",
            "prime5",
            "prime3",
        )) + "\n")

    for line in sys.stdin:

        if line.startswith("#"):
            continue

        ninput += 1
        p.Read(line)

        # NOTE(review): lsequence is not used further in this function.
        lsequence = fasta.getLength(p.mSbjctToken)

        # Only the predicted region is fetched, so all exon coordinates
        # below are relative to p.mSbjctGenomeFrom (sbjct_from=0).
        genomic_sequence = fasta.getSequence(p.mSbjctToken, p.mSbjctStrand,
                                             p.mSbjctGenomeFrom,
                                             p.mSbjctGenomeTo).upper()

        exons = Exons.Alignment2Exons(p.mMapPeptide2Genome,
                                      query_from=0,
                                      sbjct_from=0)

        # Walk over consecutive exon pairs; each pair delimits one intron.
        # Pairing via zip also copes with an empty exon list (no introns)
        # instead of failing on exons[0].
        for nintron, (last_e, e) in enumerate(zip(exons, exons[1:]), 1):

            lintron = e.mGenomeFrom - last_e.mGenomeTo

            # NOTE(review): the name suggests "length is a multiple of
            # three", but the test is true when it is NOT a multiple of
            # three.  Left unchanged because it decides the reported
            # nstops column -- confirm the intended condition.
            intron_is_l3 = lintron % 3 != 0

            if intron_is_l3:
                # Get sequence, include also residues from split codons
                # when checking for stop codons.
                # Note that e.mAlignment can sometimes be empty. This might
                # be an exonerate bug. In the alignment string there are two
                # consecutive exons.
                if e.mAlignment and last_e.mAlignment and \
                        e.mAlignment[0][0] == "S":
                    offset_left = last_e.mAlignment[-1][2]
                    offset_right = e.mAlignment[0][2]
                else:
                    offset_left, offset_right = 0, 0

                framed = genomic_sequence[last_e.mGenomeTo - offset_left:
                                          e.mGenomeFrom + offset_right]

                # count stop codons in consecutive, non-overlapping triplets
                intron_nstops = sum(
                    1 for x in range(0, len(framed), 3)
                    if framed[x:x + 3] in options.stop_codons)
            else:
                intron_nstops = 0

            # check for splice signals
            sequence = genomic_sequence[last_e.mGenomeTo:e.mGenomeFrom]

            intron_type, prime5, prime3 = Genomics.GetIntronType(sequence)

            if options.loglevel >= 2:
                # Up to six exonic bases on either side (lower case) around
                # the first/last five intronic bases.  The left start is
                # clamped at 0: a negative start would be interpreted by
                # Python as counting from the end of the fragment.
                left_flank = genomic_sequence[
                    max(0, last_e.mGenomeTo - 6):last_e.mGenomeTo].lower()
                right_flank = genomic_sequence[
                    e.mGenomeFrom:e.mGenomeFrom + 6].lower()
                context = left_flank + "|" + sequence[:5] + "..." + \
                    sequence[-5:] + "|" + right_flank

                options.stdlog.write("\t".join(
                    map(str, (p.mPredictionId,
                              nintron,
                              lintron,
                              intron_nstops,
                              intron_type,
                              context))) + "\n")

            # translate fragment-relative coordinates back by adding the
            # start of the predicted region
            options.stdout.write("\t".join(
                map(str, (p.mPredictionId, nintron, p.mSbjctToken,
                          p.mSbjctStrand,
                          last_e.mGenomeTo + p.mSbjctGenomeFrom,
                          e.mGenomeFrom + p.mSbjctGenomeFrom, lintron,
                          intron_nstops, intron_type, prime5, prime3))) + "\n")

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i.\n" %
                             (ninput, noutput))

    E.Stop()

Пример #4

Показать файл

Файл: gff2predictions.py Проект: santayana/cgat

                              e.mGenomeFrom, e.mGenomeTo)))

    elif options.output_format == "exons":

        if options.format == "exons":
            parser = PredictionParser.PredictionParserExons(
                contig_sizes=contig_sizes)
        else:
            raise "unknown format %s." % options.format

        results = parser.Parse(sys.stdin.readlines())
        id = 0
        for entry in results:
            exons = Exons.Alignment2Exons(
                entry.mMapPeptide2Genome,
                entry.mQueryFrom,
                entry.mSbjctGenomeFrom,
            )

            for e in exons:
                id += 1
                print "\t".join(
                    map(str,
                        (entry.mQueryToken, entry.mSbjctToken,
                         entry.mSbjctStrand, e.frame, e.mRank, e.mPeptideFrom,
                         e.mPeptideTo, e.mGenomeFrom, e.mGenomeTo)))

    elif options.output_format == "cds":

        if options.format == "exons":

Пример #5

Показать файл

def main(argv=None):
    """Write the exons (CDS) of every prediction read from stdin.

    Parses command line options in sys.argv, unless *argv* is given.

    Input lines starting with ``#`` or ``id`` are skipped.  Every other
    line is parsed as a prediction, its peptide-to-genome alignment is
    converted into exons with ``Exons.Alignment2Exons`` and the result is
    written to ``options.stdout`` in the layout selected by ``--format``.
    Lines that fail to parse (``ValueError``) are logged and counted as
    errors.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gpipe/predictions2cds.py 1858 2008-05-13 15:07:05Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-o",
                      "--forward-coordinates",
                      dest="forward_coordinates",
                      action="store_true",
                      help="input uses forward coordinates.")

    # NOTE(review): the dispatch below also has "gff-match" and "gff-exon"
    # branches, but neither is listed here, so they cannot be selected from
    # the command line; "gff" itself has no branch and is handled like
    # "default".  Confirm which names are intended.
    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="choice",
                      choices=("default", "cds", "cdnas", "map", "gff",
                               "intron-fasta", "exons"),
                      help="output format.")

    parser.add_option("-r",
                      "--reset-to-start",
                      dest="reset_to_start",
                      action="store_true",
                      help="move genomic coordinates to begin from 0.")

    parser.add_option("--reset-query",
                      dest="reset_query",
                      action="store_true",
                      help="move peptide coordinates to begin from 0.")

    parser.set_defaults(genome_file=None,
                        forward_coordinates=False,
                        format="default",
                        reset_to_start=False,
                        reset_query=False)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        sys.stdout.write("%s no arguments required.\n" % (USAGE,))
        sys.exit(2)

    def write_row(fields):
        # one tab-separated result row on the configured output stream
        options.stdout.write("\t".join(map(str, fields)) + "\n")

    # running id for rows of the default format, continues across predictions
    cds_id = 1

    # a single parser entry is re-used for all input lines
    entry = PredictionParser.PredictionParserEntry()

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    ninput, noutput, nskipped, nerrors = 0, 0, 0, 0

    for line in sys.stdin:

        # skip comments and the header line
        if line.startswith(("#", "id")):
            continue

        ninput += 1

        try:
            entry.Read(line)
        except ValueError as msg:
            options.stdlog.write("# parsing failed with msg %s in line %s" %
                                 (msg, line))
            nerrors += 1
            continue

        cds = Exons.Alignment2Exons(entry.mMapPeptide2Genome,
                                    query_from=entry.mQueryFrom,
                                    sbjct_from=entry.mSbjctGenomeFrom,
                                    add_stop_codon=0)

        # Everything below indexes cds[-1] / cds[0]; skip predictions
        # without exons instead of failing with an IndexError.
        if len(cds) == 0:
            options.stdlog.write(
                "# WARNING: no exons for prediction %s - skipped\n" %
                (entry.mPredictionId,))
            nskipped += 1
            continue

        for cd in cds:
            cd.mSbjctToken = entry.mSbjctToken
            cd.mSbjctStrand = entry.mSbjctStrand

        # sanity check: the last exon should end where the prediction ends
        if cds[-1].mGenomeTo != entry.mSbjctGenomeTo:
            options.stdlog.write(
                "# WARNING: discrepancy in exon calculation!!!\n")
            for cd in cds:
                options.stdlog.write("# %s\n" % str(cd))
            options.stdlog.write("# %s\n" % entry)

        lsequence = fasta.getLength(entry.mSbjctToken)
        genomic_sequence = fasta.getSequence(entry.mSbjctToken,
                                             entry.mSbjctStrand,
                                             entry.mSbjctGenomeFrom,
                                             entry.mSbjctGenomeTo)

        # deal with forward coordinates: convert them to negative strand
        # coordinates
        if options.forward_coordinates and entry.mSbjctStrand == "-":
            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo = (
                lsequence - entry.mSbjctGenomeTo,
                lsequence - entry.mSbjctGenomeFrom)
            for cd in cds:
                cd.InvertGenomicCoordinates(lsequence)

        # attach sequence to cds; genomic_sequence starts at the prediction
        # start, hence the subtraction
        for cd in cds:
            start = cd.mGenomeFrom - entry.mSbjctGenomeFrom
            end = cd.mGenomeTo - entry.mSbjctGenomeFrom
            cd.mSequence = genomic_sequence[start:end]

        # reset coordinates for query
        # NOTE(review): this is keyed on reset_to_start, while the
        # --reset-query option (options.reset_query) is never read in this
        # function -- confirm which flag is meant to control this step.
        if options.reset_to_start:
            peptide_offset = entry.mPeptideFrom
            for cd in cds:
                cd.mPeptideFrom -= peptide_offset
                cd.mPeptideTo -= peptide_offset

        # play with coordinates: optionally make genomic coordinates
        # relative to the start of the prediction
        if options.reset_to_start:
            offset = entry.mSbjctGenomeFrom
            for cd in cds:
                cd.mGenomeFrom -= offset
                cd.mGenomeTo -= offset
        else:
            offset = 0

        # Exactly one layout is written per prediction.  "cds" is part of
        # the same if/elif chain as the other formats; as a separate `if`
        # it would additionally fall through to the default layout below.
        if options.format == "cds":
            for rank, cd in enumerate(cds, 1):
                cd.mQueryToken = entry.mQueryToken
                cd.mSbjctToken = entry.mSbjctToken
                cd.mSbjctStrand = entry.mSbjctStrand
                cd.mRank = rank
                options.stdout.write(str(cd) + "\n")

        elif options.format == "exons":
            for rank, cd in enumerate(cds, 1):
                write_row((entry.mPredictionId, cd.mSbjctToken,
                           cd.mSbjctStrand, rank, cd.frame, cd.mPeptideFrom,
                           cd.mPeptideTo, cd.mGenomeFrom, cd.mGenomeTo))

        elif options.format == "cdnas":
            write_row((entry.mPredictionId, entry.mQueryToken,
                       entry.mSbjctToken, entry.mSbjctStrand,
                       entry.mSbjctGenomeFrom - offset,
                       entry.mSbjctGenomeTo - offset, genomic_sequence))

        elif options.format == "map":

            map_prediction2genome = alignlib_lite.makeAlignmentSet()

            # one diagonal per exon
            for cd in cds:
                alignlib_lite.addDiagonal2Alignment(
                    map_prediction2genome, cd.mPeptideFrom + 1, cd.mPeptideTo,
                    (cd.mGenomeFrom - offset) - cd.mPeptideFrom)

            write_row((entry.mPredictionId, entry.mSbjctToken,
                       entry.mSbjctStrand,
                       alignlib_lite.AlignmentFormatEmissions(
                           map_prediction2genome)))

        elif options.format == "intron-fasta":
            # a single exon has no introns
            if len(cds) == 1:
                nskipped += 1
                continue

            last = cds[0].mGenomeTo
            for rank, cd in enumerate(cds[1:], 1):
                # NOTE(review): the last key field is the prediction start
                # (entry.mSbjctGenomeFrom), not the intron end
                # (cd.mGenomeFrom) -- confirm this is intended.
                key = "%s %i %s:%s:%i:%i" % (
                    entry.mPredictionId, rank, entry.mSbjctToken,
                    entry.mSbjctStrand, last, entry.mSbjctGenomeFrom)
                sequence = genomic_sequence[
                    last - entry.mSbjctGenomeFrom:
                    cd.mGenomeFrom - entry.mSbjctGenomeFrom]
                options.stdout.write(">%s\n%s\n" % (key, sequence))
                last = cd.mGenomeTo

        elif options.format == "gff-match":
            options.stdout.write(
                "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tTarget \"%s\" %i %i; Score %i; Introns %i; Frameshifts %i; Stops %i" %
                (entry.mSbjctToken,
                 "gpipe", "similarity",
                 entry.mSbjctGenomeFrom,
                 entry.mSbjctGenomeTo,
                 entry.mPercentIdentity,
                 entry.mSbjctStrand,
                 ".",
                 entry.mQueryToken,
                 entry.mQueryFrom,
                 entry.mQueryTo,
                 entry.score,
                 entry.mNIntrons,
                 entry.mNFrameShifts,
                 entry.mNStopCodons) + "\n")

        elif options.format == "gff-exon":
            for rank, cd in enumerate(cds, 1):
                # explicit floor division: the values are formatted with %i
                options.stdout.write(
                    "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tTarget \"%s\" %i %i; Score %i; Rank %i/%i; Prediction %i" %
                    (entry.mSbjctToken,
                     "gpipe", "similarity",
                     cd.mGenomeFrom,
                     cd.mGenomeTo,
                     entry.mPercentIdentity,
                     entry.mSbjctStrand,
                     ".",
                     entry.mQueryToken,
                     cd.mPeptideFrom // 3 + 1,
                     cd.mPeptideTo // 3 + 1,
                     entry.score,
                     rank,
                     len(cds),
                     entry.mPredictionId) + "\n")

        else:
            # default layout: peptide coordinates are replaced by running
            # offsets that accumulate the genomic length of each exon
            exon_from = 0
            for cd in cds:
                cd.mPeptideFrom = exon_from
                exon_from += cd.mGenomeTo - cd.mGenomeFrom
                cd.mPeptideTo = exon_from
                write_row((cds_id, entry.mPredictionId, cd.mPeptideFrom,
                           cd.mPeptideTo, cd.frame, cd.mGenomeFrom,
                           cd.mGenomeTo, cd.mSequence))
                cds_id += 1

        noutput += 1

    # report the counters collected above
    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nskipped=%i, nerrors=%i.\n" %
            (ninput, noutput, nskipped, nerrors))

    # counterpart of the E.Start() call above
    E.Stop()

Пример #6

Показать файл

Файл: predictions2transcripts.py Проект: yangjl/cgat

def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gpipe/predictions2transcripts.py 1841 2008-05-08 12:07:13Z andreas $",
                                    usage = globals()["__doc__"] )
    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome."  )

    parser.add_option("-o", "--output-filename-summary", dest="output_filename_summary", type="string",
                      help="filename with summary information."  )

    parser.add_option( "--skip-header", dest="skip_header", action="store_true",
                       help="skip header."  )

    parser.add_option( "--start-codon-boundary", dest="start_codon_boundary", type="int",
                      help="maximum extension for start codon (make divisible by 3)."  )
    
    parser.add_option( "--stop-codon-boundary", dest="stop_codon_boundary", type="int",
                      help="maximum extension for stop codon (make divisible by 3)."  )

    parser.add_option( "--left-extension-mode", dest="left_extension_mode", type="choice",
                       choices=("first-start", "first-stop-backtrack"),
                       help="extension mode for 5' end.")

    parser.add_option( "--fill-introns", dest="fill_introns", type="int",
                      help="fill intron if divisible by three and no stop codon up to a maximum length of #."  )

    parser.add_option( "--introns-max-stops", dest="introns_max_stops", type="int",
                      help="maximum number of stop codons to tolerate within an intron."  )

    parser.add_option( "--output-format", dest="output_format", type="choice",
                       choices=("predictions", "extensions", "filled-introns"),
                      help="output format."  )
    
    parser.set_defaults(
        genome_file = "genome",
        start_codons = ("ATG"),
        stop_codons = ("TAG", "TAA", "TGA"),
        start_codon_boundary = 9999,
        stop_codon_boundary  = 9999,
        fill_introns = 0,
        introns_max_stops = 0,
        left_splice_signals = ("GT",),
        right_splice_signals = ("AG",),
        output_format="extensions",
        left_extension_mode = "first-start",
        skip_header = False,
        output_filename_summary = None,
        )

    (options, args) = E.Start( parser, add_pipe_options = True )

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    options.start_codon_boundary = int(options.start_codon_boundary / 3)
    options.stop_codon_boundary = int(options.stop_codon_boundary / 3)

    fasta = IndexedFasta.IndexedFasta( options.genome_file )
    
    p = PredictionParser.PredictionParserEntry()

    ninput, noutput = 0, 0
    nfilled = 0
    nseqs_filled = 0
    nseqs_extended = 0
    left_extensions = []
    right_extensions = []
    filled_introns = []

    if not options.skip_header:
        if options.output_format == "predictions":
            options.stdout.write( Prediction.Prediction().getHeader() + "\n" )
        elif options.output_format == "filled-introns":
            options.stdout.write("\t".join( ("prediction_id",
                                             "intron",
                                             "peptide_sequence",
                                             "genomic_sequence") ) + "\n" )

    if options.output_filename_summary:
        outfile_summary = open (options.output_filename_summary, "w" )
        outfile_summary.write( "id\ttype\tnumber\tlength\tfrom\tto\tsequence\n" )
    else:
        outfile_summary = None

    for line in options.stdin:
        
        if line[0] == "#": continue

        ninput += 1
        p.Read(line)

        lsequence = fasta.getLength( p.mSbjctToken )

        genome_from = max( 0, p.mSbjctGenomeFrom - options.start_codon_boundary)
        genome_to = min( lsequence, p.mSbjctGenomeTo + options.stop_codon_boundary)
        
        genomic_sequence = fasta.getSequence( p.mSbjctToken, p.mSbjctStrand,
                                              genome_from,
                                              genome_to ).upper()

        ########################################################################
        ########################################################################
        ########################################################################            
        ## Do extensions
        
        if options.start_codon_boundary or options.stop_codon_boundary:
            
            extension_start = p.mSbjctGenomeFrom - genome_from 
            extension_stop  = genome_to - p.mSbjctGenomeTo
            
            fragment_to = extension_start + p.mSbjctGenomeTo - p.mSbjctGenomeFrom

            lfragment = len(genomic_sequence)

            ########################################################################
            ########################################################################
            ########################################################################            
            ## find start codon
            start = extension_start
            found_start = False
            if options.left_extension_mode == "first-start":

                found_start, start = findCodonReverse( genomic_sequence,
                                                       start,
                                                       options.start_codons,
                                                       options.stop_codons )
                
            elif options.left_extension_mode == "first-stop-backtrack":

                if genomic_sequence[start:start+3] in options.start_codons:
                    found_start = True
                else:
                    found_start, start = findCodonReverse( genomic_sequence,
                                                           start,
                                                           options.stop_codons )
                    
                    if found_start:
                        E.info("prediction %s: stop found at %i (%i) backtracking ..." % ( p.mPredictionId, start, extension_start - start) )
                        
                        ## bracktrack to first start codon
                        found_start = False
                        while start < extension_start:
                            start += 3
                            if genomic_sequence[start:start+3] in options.start_codons:
                                found_start = True
                                break
                        else:
                            start = extension_start

                        if found_start:
                            E.info("start codon found at %i (%i)." % ( start, extension_start - start) )
                        else:
                            E.info("no start codon found." )
                    else:
                        E.info("prediction %s: no stop found ... backtracking to start codon." % ( p.mPredictionId ) )

                        found_start, start = findCodonReverse( genomic_sequence, start, options.start_codons )

                        E.info("prediction %s: no start codon found." % ( p.mPredictionId ) )

            if found_start:
                start += genome_from
            else:
                start = p.mSbjctGenomeFrom

            dstart = p.mSbjctGenomeFrom - start
            
            ########################################################################
            ########################################################################
            ########################################################################            
            ## find stop codon
            ## stop points to the beginning of the codon, thus the stop codon will
            ## not be part of the sequence.
            stop = fragment_to
            found_stop = 0
            while stop < lfragment and \
                      genomic_sequence[stop:stop+3] not in ("NNN", "XXX"):
                if genomic_sequence[stop:stop+3] in options.stop_codons:
                    found_stop = 1
                    break

                stop += 3

            if found_stop:
                stop += genome_from 
            else:
                stop = p.mSbjctGenomeTo

            dstop = stop - p.mSbjctGenomeTo 

            ########################################################################
            ########################################################################
            ########################################################################            
            ## build new prediction
            map_peptide2genome = []
            if dstart: map_peptide2genome.append( ("G", 0, dstart) )
            map_peptide2genome += p.mMapPeptide2Genome
            if dstop: map_peptide2genome.append( ("G", 0, dstop) )

            E.info("prediction %s: extension: found_start=%i, found_stop=%i, left=%i, right=%i" % ( p.mPredictionId, found_start, found_stop, dstart, dstop ) )

            ## save results
            p.mMapPeptide2Genome = map_peptide2genome
            p.mAlignmentString = Genomics.Alignment2String( map_peptide2genome )
            p.mSbjctGenomeFrom -= dstart
            p.mSbjctGenomeTo += dstop
            p.mSbjctFrom += dstart / 3
            p.mSbjctTo += dstart / 3 + dstop / 3            
            
            if dstart or dstop:
                if dstart: left_extensions.append( dstart )
                if dstop: right_extensions.append( dstop )
                
                nseqs_extended += 1

        ## update genomic sequence because borders might have changed.
        genomic_sequence = fasta.getSequence( p.mSbjctToken,
                                              p.mSbjctStrand,
                                              p.mSbjctGenomeFrom,
                                              p.mSbjctGenomeTo ).upper()

        if options.fill_introns:
            
            has_filled = False

            exons = Exons.Alignment2Exons( p.mMapPeptide2Genome,
                                           query_from = 0,
                                           sbjct_from = 0 )

            new_exons = []

            last_e = exons[0]

            nintron = 0

            for e in exons[1:]:

                nintron += 1
                lintron = e.mGenomeFrom - last_e.mGenomeTo
                
                if lintron > options.fill_introns or (lintron) % 3 != 0:
                    E.debug( "prediction %s: intron %i of size %i discarded." % \
                                 (p.mPredictionId,
                                  nintron, lintron ) )
                    
                    new_exons.append(last_e)
                    last_e = e
                    continue

                ## get sequence, include also residues from split codons
                ## when checking for stop codons.
                if e.mAlignment[0][0] == "S":
                    offset_left = last_e.mAlignment[-1][2]
                    offset_right = e.mAlignment[0][2]
                else:
                    offset_left, offset_right = 0, 0
                    
                sequence = genomic_sequence[last_e.mGenomeTo - offset_left:e.mGenomeFrom+offset_right]
                
                ## check for splice sites
                for signal in options.left_splice_signals:
                    if sequence[offset_left:offset_left+len(signal)] == signal:
                        left_signal = True
                        break
                else:
                    left_signal = False
                    
                for signal in options.right_splice_signals:
                    if sequence[-(len(signal)+offset_right):-offset_right] == signal:
                        right_signal = True
                        break
                else:
                    right_signal = False

                nstops, ngaps = 0, 0
                for codon in [ sequence[x:x+3] for x in range(0,len(sequence),3) ]:
                    if codon in options.stop_codons: nstops += 1
                    if "N" in codon.upper(): ngaps += 1
                        
                    E.debug( "prediction %s: intron %i of size %i (%i-%i) (%s:%s:%i:%i): stops=%i, gaps=%i, signals=%s,%s." % \
                                 (p.mPredictionId,
                                  nintron, lintron,
                                  offset_left, offset_right,
                                  p.mSbjctToken, p.mSbjctStrand,
                                  p.mSbjctGenomeFrom + last_e.mGenomeTo,
                                  p.mSbjctGenomeFrom + e.mGenomeFrom,
                                  nstops,
                                  ngaps,
                                  left_signal, right_signal ) )

                if nstops + ngaps > options.introns_max_stops:
                    new_exons.append(last_e)                                        
                    last_e = e
                    continue
                
                E.info( "prediction %s: filling intron %i of size %i: stops=%i, gaps=%i, signals=%s,%s" % \
                            (p.mPredictionId,
                             nintron, lintron,
                             nstops,
                             ngaps,
                             left_signal, right_signal))

                e.Merge( last_e )
                has_filled = True
                nfilled += 1
                last_e = e

                if options.output_format == "filled-introns":
                    options.stdout.write( "\t".join( map(str, ( p.mPredictionId,
                                                                nintron,
                                                                Genomics.TranslateDNA2Protein( sequence ),
                                                                sequence ) ) ) + "\n" )
                                                                
                
                filled_introns.append(lintron)
                p.mNIntrons -= 1
                
            new_exons.append(last_e)

            if has_filled: nseqs_filled += 1

            Exons.UpdatePeptideCoordinates( new_exons )
            
            p.mMapPeptide2Genome = Exons.Exons2Alignment( new_exons )
            p.mAlignmentString = Genomics.Alignment2String( p.mMapPeptide2Genome )

        ## build translated sequence
        p.mMapPeptide2Translation, p.mTranslation = Genomics.Alignment2PeptideAlignment( \
               p.mMapPeptide2Genome, p.mQueryFrom, 0, genomic_sequence )

        ## output info
        if options.output_format == "predictions":
            options.stdout.write( str(p) + "\n" )
        elif options.output_format == "extensions":
            if found_start: found_start = 1
            if found_stop: found_stop = 1
            options.stdout.write( "\t".join( map(str, ( p.mPredictionId,
                                                        found_start, found_stop, 
                                                        dstart, dstop,
                                                        p.mTranslation,
                                                        p.mSbjctGenomeFrom, p.mSbjctGenomeTo,
                                                        p.mAlignmentString ))) + "\n" )

        noutput += 1
        options.stdout.flush()

    E.info("stats  : %s" % "\t".join(Stats.DistributionalParameters().getHeaders() ))
    E.info("left   : %s" % str(Stats.DistributionalParameters(left_extensions)) )
    E.info("right  : %s" % str(Stats.DistributionalParameters(right_extensions)) )
    E.info("introns: %s" % str(Stats.DistributionalParameters(filled_introns)) )        
    E.info("ninput=%i, noutput=%i, nextended=%i, nfilled=%i, nexons_filled=%i" % (\
            ninput, noutput, nseqs_extended, nseqs_filled, nfilled))
        
    E.Stop()

Пример #7

Показать файл

def ResolveExonOverlaps(gene_id, predictions):
    """Cluster predictions into genes by exon overlap and print the mapping.

    Each prediction is split into exons (via ``Exons.Alignment2Exons`` on
    its alignment string).  All exons are sorted by genomic coordinates
    and swept once; predictions owning overlapping exons are merged into
    the same gene.  If the module-level flag ``param_exon_identity`` is
    true, two exons only count as overlapping when their start and end
    coordinates are identical.

    For every resulting gene one line ``<gene_id>\\t<prediction_id>`` per
    member prediction is printed to stdout.

    Arguments
    ---------
    gene_id
        Identifier to assign to the first gene that is output; it is
        incremented by one for each subsequent non-empty gene.
    predictions
        Sequence of prediction entries.  The attributes read here are
        ``mAlignmentString``, ``mSbjctGenomeFrom``, ``mSbjctGenomeTo``
        and ``mPredictionId``.

    Returns
    -------
    The next unused gene identifier (``gene_id`` unchanged if
    *predictions* is empty).
    """

    if not predictions:
        return gene_id

    # Collect (from, to, prediction_number) for all exons.  Predictions
    # are numbered from 1; slot 0 of the lookup tables below is unused.
    all_exons = []
    for n, p in enumerate(predictions, 1):
        exons = Exons.Alignment2Exons(
            Genomics.String2Alignment(p.mAlignmentString),
            query_from=0,
            sbjct_from=p.mSbjctGenomeFrom)

        if exons:
            for exon in exons:
                all_exons.append((exon.mGenomeFrom, exon.mGenomeTo, n))
        else:
            # Fallback only: without any exons use the whole genomic span.
            # (Previously this was the ``else`` of the ``for`` loop, which
            # always executes when the loop has no ``break`` and therefore
            # added the full span for every prediction.)
            all_exons.append((p.mSbjctGenomeFrom, p.mSbjctGenomeTo, n))

    # Initially every prediction x forms its own gene x.  A real list is
    # required because entries are reassigned while merging.
    npredictions = len(predictions)
    map_prediction2gene = list(range(0, npredictions + 1))
    map_gene2predictions = [None]
    for x in range(1, npredictions + 1):
        map_gene2predictions.append([x])

    all_exons.sort()

    # cluster exons by overlap
    last_exon_from, last_exon_to, last_p = all_exons[0]

    for exon_from, exon_to, p in all_exons[1:]:
        overlap = min(exon_to, last_exon_to) - max(exon_from, last_exon_from)

        # in identity mode an overlap only counts if both boundaries match
        if overlap and param_exon_identity:
            overlap = (exon_from == last_exon_from) and \
                (exon_to == last_exon_to)

        if overlap > 0:
            # rewire pointers to point to gene of previous prediction
            # if they belong to different genes
            new_g = map_prediction2gene[last_p]
            old_g = map_prediction2gene[p]

            if new_g != old_g:
                for x in map_gene2predictions[old_g]:
                    map_gene2predictions[new_g].append(x)
                    map_prediction2gene[x] = new_g
                map_gene2predictions[old_g] = []
        else:
            # no overlap: create new gene, if prediction has no gene
            # associated with it yet.  With the initialisation above every
            # prediction always has a non-zero gene, so this is a safeguard.
            if not map_prediction2gene[p]:
                map_prediction2gene[p] = len(map_gene2predictions)
                map_gene2predictions.append([p])

        if param_exon_identity:
            # compare strictly against the previous exon
            last_exon_to = exon_to
            last_exon_from = exon_from
        else:
            # extend the current cluster to its right-most end
            last_exon_to = max(last_exon_to, exon_to)

        last_p = p

    # output: one line per prediction, skipping genes emptied by merging
    for x in range(1, len(map_gene2predictions)):
        if map_gene2predictions[x]:
            for p in map_gene2predictions[x]:
                print("%i\t%i" % (gene_id, predictions[p - 1].mPredictionId))
            gene_id += 1

    return gene_id

Пример #8

Показать файл

        alignlib_lite.ALIGNMENT_LOCAL, param_gop, param_gep)
    map_reference2target = alignlib_lite.makeAlignmentVector()
    assignment_id = 0

    for line in cr.fetchall():

        reference = PredictionParser.PredictionParserEntry()
        reference.FillFromTable(line)

        ct = dbhandle.cursor()
        ct.execute(statement %
                   (param_tablename_predictions_target, reference.mSbjctToken,
                    reference.mSbjctStrand, reference.mSbjctGenomeFrom,
                    reference.mSbjctGenomeTo))

        reference_exons = Exons.Alignment2Exons(reference.mMapPeptide2Genome,
                                                0, reference.mSbjctFrom)

        for line2 in ct.fetchall():
            target = PredictionParser.PredictionParserEntry()
            target.FillFromTable(line2)

            target_exons = Exons.Alignment2Exons(target.mMapPeptide2Genome, 0,
                                                 target.mSbjctFrom)

            ## check for exon overlap
            rr, tt = 0, 0
            overlap = 0
            while rr < len(reference_exons) and tt < len(target_exons):

                r = reference_exons[rr]
                t = target_exons[tt]