示例#1
0
    def _buildAllele(allele_id,
                     transcript,
                     exons,
                     introns,
                     offsets,
                     virtual_coordinates=False,
                     reference_exons=None):
        '''build the coding sequence and peptide of one allele of a transcript.

        The exons of `transcript` are concatenated in order. Introns no
        longer than `frameshiftsize` are treated as frame-shifting introns
        and are merged into the preceding exon if their effective size
        (including indels in the flanking exon ends) is a multiple of three
        and they contain no stop codon. Longer introns are checked for a
        known splice signal; an unknown signal that is not all lower-case
        at both ends marks the allele as splice truncated and stops the
        assembly at that point. The assembled cds is translated and the
        first stop codon is used to flag nonsense-mediated-decay knockouts
        and stop-truncated alleles.

        Arguments
        ---------
        allele_id :
            Identifier of the allele; only used in debugging output.
        transcript :
            Sequence of exon records providing ``start``, ``end``,
            ``transcript_id`` and ``frame``.
        exons :
            Mapping of ``(exon.start, exon.end)`` to a sequence holding one
            string per reference position; an empty string is a deleted
            base, a string longer than one an insertion.
        introns :
            Mapping of ``(previous_exon.end, exon.start)`` to a sequence in
            the same per-position layout as `exons`.
        offsets :
            Sequence of ``(position, offset)`` pairs - assumed to be sorted
            by position, TODO confirm against caller.
        virtual_coordinates :
            Not used in this function.
        reference_exons :
            Not used in this function.

        NOTE(review): `frameshiftsize`, `is_seleno` and
        `reference_coordinates` are read from an enclosing scope that is
        not visible here.

        Returns
        -------
        A tuple ``(allele, map_cds2reference)`` of the `Allele` record and
        the alignment mapping cds positions to reference positions.
        '''

        def _getOffset(pos, offsets):
            '''return the offset of the last entry in offsets located at or
            before pos, 0 if there is none.'''
            x = 0
            while x < len(offsets) and offsets[x][0] <= pos:
                x += 1
            x -= 1
            if x >= 0:
                return offsets[x][1]
            else:
                return 0

        def _sumIndels(ss):
            '''sum indels within ss (negative for deletions, positive
            for insertions).'''
            return sum(len(s) - 1 for s in ss)

        def _getEndOffsets(ss):
            '''get the offset at exons due to deletions at
            start/end of exon.'''
            l = len(ss)
            x = 0
            while x < l and ss[x] == "":
                x += 1
            start_offset = x

            x = l - 1
            while x >= 0 and ss[x] == "":
                x -= 1
            if x >= 0:
                return start_offset, (l - 1) - x
            else:
                # everything is deleted: report it at the start only
                return start_offset, 0

        def _addCds2Reference(map_cds2reference, cds_start, cds_seq,
                              reference_start):
            '''add cds to reference'''
            c, r = cds_start, reference_start
            for x in cds_seq:
                l = len(x)
                # deleted positions advance the reference only
                if l != 0:
                    map_cds2reference.addPair(c, r)
                    c += l
                r += 1

        def _debugExon(exon, exon_sequence, exon_seq, genome_pos,
                       start_offset, end_offset):
            '''print indels and coordinates of an exon for debugging.'''
            print("%i: exon_indels (%i-%i):" %
                  (allele_id, exon.start, exon.end))
            for x, c in enumerate(exon_sequence):
                if len(c) != 1:
                    print(x + exon.start, ":%s:" % c)
            print()
            print(exon_sequence)
            print("genome_pos=", genome_pos,
                  ",exon=%i-%i" % (genome_pos, genome_pos + len(exon_seq)),
                  ", len(exon_seq)=", len(exon_seq), ", len(exon)=",
                  exon.end - exon.start,
                  ", offsets=%i,%i," % (start_offset, end_offset),
                  ", offset at start=", _getOffset(exon.start, offsets),
                  ", offset at end=", _getOffset(exon.end, offsets))

        # counts
        is_splice_truncated = False
        is_nmd_knockout = False
        is_stop_truncated = False
        nuncorrected_frameshifts = 0
        ncorrected_frameshifts = 0
        nframeshifts = 0
        nsplice_noncanonical = 0
        reference_first_stop_start = -1
        reference_first_stop_end = -1

        # map between the new cds sequence and the reference
        # sequence
        map_cds2reference = alignlib_lite.py_makeAlignmentBlocks()

        ###################################################
        # process first exon
        exon = transcript[0]
        transcript_id = exon.transcript_id

        # collect offset for exon.start
        genome_start = exon.start
        genome_start += _getOffset(genome_start, offsets)
        lcds, cds = 0, []
        cds_starts = [0]

        # still need to deal with deletions of first base:
        exon_starts = [genome_start]
        exon_key = (exon.start, exon.end)
        exon_sequence = exons[exon_key]
        exon_seq = "".join(exon_sequence)

        cds.append(exon_seq)
        _addCds2Reference(map_cds2reference, lcds, exon_sequence, exon.start)
        lcds = len(exon_seq)

        if len(exon_seq) != exon.end - exon.start:
            nframeshifts += 1

        # add first exon to genome position
        genome_pos = genome_start + len(exon_seq)
        last_end = exon.end

        # correct for deletions at start/end of exon
        start_offset, end_offset = _getEndOffsets(exon_sequence)

        # length of original transcript
        loriginal = sum(x.end - x.start for x in transcript)

        if E.global_options.loglevel >= 8:
            _debugExon(exon, exon_sequence, exon_seq, genome_pos,
                       start_offset, end_offset)

        for exon in transcript[1:]:

            last_exon_sequence = exon_sequence

            # get the next intron/exon parameters
            exon_key = (exon.start, exon.end)
            exon_sequence = exons[exon_key]
            start_offset, end_offset = _getEndOffsets(exon_sequence)
            intron_key = (last_end, exon.start)

            if last_end == exon.start:
                # catch empty introns
                intron_sequence = []
                intron_key = None
            else:
                intron_sequence = introns[intron_key]

            intron_seq = "".join(intron_sequence)

            ###################################################
            ###################################################
            ###################################################
            # add preceding intron
            new_exon = True

            if len(intron_seq) > frameshiftsize:

                intron_name, intron_seq5, intron_seq3 = Genomics.GetIntronType(
                    intron_seq)
                if intron_name == "unknown":
                    if intron_seq[:2].islower() and intron_seq[-2:].islower():
                        E.debug(
                            "%s: transcript has unknown splice signal - kept because not a variant: %s: %s:%s"
                            % (transcript_id, intron_name, intron_seq5,
                               intron_seq3))
                        nsplice_noncanonical += 1
                    else:
                        is_splice_truncated = True
                        E.debug(
                            "%s: transcript has splice truncated allele: %s: %s:%s"
                            % (transcript_id, intron_name, intron_seq5,
                               intron_seq3))
                        # the cds assembled so far is kept
                        break
                # start a new exon
                cds_starts.append(lcds)

            else:
                # treat as frameshifting intron
                #
                # frame-shifting introns are checked if they are
                # fixed by indels either in the intron itself or
                # the terminal exon sequence. To this end, the effective
                # size of the intron is computed:
                # effective size of intron =
                # indels at terminal x bases at previous exon
                # + size of intron
                # + indels at terminal x bases at next exon
                effective_intron_size = len(intron_seq)
                # only the last `frameshiftsize` positions of the previous
                # exon; the former start index max(0, -frameshiftsize)
                # was always 0 and thus covered the whole exon.
                previous_indels = _sumIndels(last_exon_sequence[
                    max(0, len(last_exon_sequence) - frameshiftsize):])
                next_indels = _sumIndels(exon_sequence[:frameshiftsize])
                effective_intron_size += previous_indels + next_indels

                # stop codons can only be checked if the intron itself
                # is in frame
                if previous_indels + next_indels == 0 and len(
                        intron_seq) % 3 == 0:
                    has_stop = "X" in Genomics.translate(intron_seq.upper(),
                                                         is_seleno=is_seleno)
                else:
                    has_stop = False

                if effective_intron_size % 3 == 0 and not has_stop:
                    E.debug(
                        "%s: fixed frame-shifting intron %i-%i of size %i (size:%i, indels:%i,%i)"
                        % (
                            transcript_id,
                            last_end,
                            exon.start,
                            effective_intron_size,
                            len(intron_seq),
                            previous_indels,
                            next_indels,
                        ))

                    # add to previous exon
                    cds.append(intron_seq)
                    lcds += len(intron_seq)
                    ncorrected_frameshifts += 1
                    new_exon = False
                else:
                    E.debug(
                        "%s: could not fix frame-shifting intron %i-%i of size %i (size:%i, indels:%i,%i, has_stop=%i)"
                        % (transcript_id, last_end, exon.start,
                           effective_intron_size, len(intron_seq),
                           previous_indels, next_indels, has_stop))

                    nuncorrected_frameshifts += 1
                    # start a new exon
                    cds_starts.append(lcds)

            if E.global_options.loglevel >= 8:
                print("%i: intron_indels (%i-%i):" %
                      (allele_id, last_end, exon.start))
                if intron_key:
                    for x, c in enumerate(intron_sequence):
                        if len(c) != 1:
                            print(x + last_end, ":%s:" % c)
                    print()
                    print(intron_sequence)
                    print(
                        "genome_pos=", genome_pos, ",intron=%i-%i" %
                        (genome_pos, genome_pos + len(intron_seq)),
                        ", len(intron_seq)=", len(intron_seq),
                        ", len(intron)=",
                        exon.start - last_end, ", offset at start=",
                        _getOffset(last_end, offsets), ", offset at end=",
                        _getOffset(exon.start, offsets))
                else:
                    print("empty intron")

            genome_pos += len(intron_seq)

            # assertion - check if genomic coordinate of intron is consistent
            # with offset
            test_offset = _getOffset(exon.start, offsets)
            is_offset = genome_pos - exon.start
            assert is_offset == test_offset, "intron offset difference: %i != %i" % (
                is_offset, test_offset)

            ###################################################
            ###################################################
            ###################################################
            # add the exon
            exon_seq = "".join(exon_sequence)
            cds.append(exon_seq)

            if len(exon_seq) != exon.end - exon.start:
                nframeshifts += 1

            if new_exon:
                if reference_coordinates:
                    exon_starts.append(exon.start + start_offset)
                else:
                    exon_starts.append(genome_pos)

            _addCds2Reference(map_cds2reference, lcds, exon_sequence,
                              exon.start)

            lcds += len(exon_seq)
            last_end = exon.end

            if E.global_options.loglevel >= 8:
                _debugExon(exon, exon_sequence, exon_seq, genome_pos,
                           start_offset, end_offset)

            genome_pos += len(exon_seq)

            test_offset = _getOffset(exon.end, offsets)
            is_offset = genome_pos - exon.end
            assert is_offset == test_offset, "exon offset difference: %i != %i" % (
                is_offset, test_offset)

        cds = "".join(cds)
        assert lcds == len(cds)

        # fix incomplete codons at the end of the sequence
        if lcds % 3 != 0:
            offset = lcds % 3
            cds = cds[:-offset]

        # add frame correction for transcripts that do not start at frame=0
        start_frame = (3 - (int(transcript[0].frame) % 3)) % 3

        # n are ignored (? in sequence to deal with genes like Muc2)
        peptide = Genomics.translate("n" * start_frame + cds,
                                     is_seleno=is_seleno,
                                     prefer_lowercase=False,
                                     ignore_n=True)

        # find the first stop codon
        if start_frame != 0:
            # ignore first, potentially incomplete base
            pep_first_stop = peptide.upper().find("X", 1)
        else:
            pep_first_stop = peptide.upper().find("X")

        E.debug("%s: translated peptide = %s, first stop at %i" %
                (transcript_id, peptide, pep_first_stop))

        peptide = peptide.replace("?", "x")

        if E.global_options.loglevel >= 8:
            E.debug("peptide=%s" % peptide)
            E.debug("cds=%s" % cds)

        E.debug("%s: start_frame=%i, first stop at %i/%i" %
                (transcript_id, start_frame, pep_first_stop, len(peptide)))

        lpeptide, lcds = len(peptide), len(cds)

        # check for non-sense mediated decay
        if pep_first_stop != -1:
            cds_first_stop = pep_first_stop * 3 - start_frame
            if cds_first_stop < cds_starts[-1]:
                # stop codon lies before the start of the last exon
                if ncorrected_frameshifts or nuncorrected_frameshifts:
                    E.warn(
                        "nmd knockout transcript %s has frameshifts: %i corrected, %i uncorrected"
                        % (transcript_id, ncorrected_frameshifts,
                           nuncorrected_frameshifts))
                is_nmd_knockout = True
                cds = peptide = ""
                lpeptide, lcds = 0, 0
                reference_first_stop_start, reference_first_stop_end = \
                    (map_cds2reference.mapRowToCol(cds_first_stop),
                     map_cds2reference.mapRowToCol(cds_first_stop + 3))
            elif pep_first_stop < len(peptide) - 1:
                # stop codon within the last exon, but not at the very end
                is_stop_truncated = True
                cds = cds[:cds_first_stop]
                # the slice used to be computed and discarded, leaving
                # the peptide untruncated
                peptide = peptide[:pep_first_stop]
                lpeptide, lcds = len(peptide), len(cds)
                reference_first_stop_start, reference_first_stop_end = \
                    (map_cds2reference.mapRowToCol(cds_first_stop),
                     map_cds2reference.mapRowToCol(cds_first_stop + 3))
            else:
                E.warn(
                    "first stop at %i(cds=%i) ignored: last exon start at %i" %
                    (pep_first_stop, cds_first_stop, cds_starts[-1]))

        else:
            # -1 for no stop codon found (pep_first_stop is already -1)
            cds_first_stop = -1

        # peptide is never None, so test the knockout flag instead
        if is_nmd_knockout and nframeshifts == 0:
            E.warn(
                "transcript %s is knockout, though there are no indels - must be nonsense mutation"
                % (transcript_id))

        # build frames
        frames = [start_frame]
        start = start_frame
        l = 0
        for end in cds_starts[1:]:
            l += end - start
            frames.append((3 - l % 3) % 3)
            start = end

        return Allele._make((
            cds,
            peptide,
            len(cds_starts),
            cds_starts,
            exon_starts,
            frames,
            is_nmd_knockout,
            is_splice_truncated,
            is_stop_truncated,
            nframeshifts,
            ncorrected_frameshifts,
            nuncorrected_frameshifts,
            pep_first_stop,
            lpeptide,
            cds_first_stop,
            lcds,
            reference_first_stop_start,
            reference_first_stop_end,
            loriginal,
            nsplice_noncanonical,
        )), map_cds2reference
示例#2
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads predictions line by line from stdin and writes one
    tab-separated row per intron (the gap between two consecutive exons
    of a prediction) to ``options.stdout``: coordinates, length, number
    of stop codons and the splice signal reported by
    ``Genomics.GetIntronType``.

    NOTE(review): *argv* is defaulted here but is not forwarded to
    ``E.Start`` in this block - confirm how E.Start obtains its arguments.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gpipe/predictions2introns.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-o",
                      "--output-filename-summary",
                      dest="output_filename_summary",
                      type="string",
                      help="filename with summary information.")

    parser.add_option("--skip-header",
                      dest="skip_header",
                      action="store_true",
                      help="skip header.")

    parser.add_option(
        "--fill-introns",
        dest="fill_introns",
        type="int",
        help=
        "fill intron if divisible by three and no stop codon up to a maximum length of #."
    )

    parser.add_option(
        "--introns-max-stops",
        dest="introns_max_stops",
        type="int",
        help="maximum number of stop codons to tolerate within an intron.")

    parser.add_option("--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("predictions", "extensions", "filled-introns"),
                      help="output format.")

    parser.set_defaults(
        genome_file="genome",
        # one-element tuple: ("ATG") is just the string "ATG"
        start_codons=("ATG",),
        stop_codons=("TAG", "TAA", "TGA"),
        skip_header=False,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        # same output as the former print statement, but valid syntax in
        # both Python 2 and Python 3
        sys.stdout.write("%s %s\n" % (USAGE, "no arguments required."))
        sys.exit(2)

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    p = PredictionParser.PredictionParserEntry()

    ninput, noutput = 0, 0

    if not options.skip_header:
        options.stdout.write("\t".join((
            "prediction_id",
            "intron",
            "contig",
            "strand",
            "start",
            "end",
            "length",
            "nstops",
            "type",
            "prime5",
            "prime3",
        )) + "\n")

    for line in sys.stdin:

        # skip comment lines
        if line.startswith("#"):
            continue

        ninput += 1
        p.Read(line)

        # NOTE(review): the length is not used below; the call is kept in
        # case getLength also validates the contig name - confirm.
        lsequence = fasta.getLength(p.mSbjctToken)

        genomic_sequence = fasta.getSequence(p.mSbjctToken, p.mSbjctStrand,
                                             p.mSbjctGenomeFrom,
                                             p.mSbjctGenomeTo).upper()

        exons = Exons.Alignment2Exons(p.mMapPeptide2Genome,
                                      query_from=0,
                                      sbjct_from=0)

        last_e = exons[0]

        nintron = 0

        for e in exons[1:]:

            nintron += 1
            lintron = e.mGenomeFrom - last_e.mGenomeTo

            # NOTE(review): despite its name this flag is true when the
            # intron length is NOT a multiple of three; left unchanged as
            # it determines the nstops output column - confirm the intent.
            intron_is_l3 = lintron % 3 != 0

            if intron_is_l3:
                ## get sequence, include also residues from split codons
                ## when checking for stop codons.
                ## note that e.mAlignment can sometimes be empty. This might
                ## be an exonerate bug. In the alignment string there are two
                ## consecutive exons.
                if e.mAlignment and last_e.mAlignment and e.mAlignment[0][
                        0] == "S":
                    offset_left = last_e.mAlignment[-1][2]
                    offset_right = e.mAlignment[0][2]
                else:
                    offset_left, offset_right = 0, 0

                sequence = genomic_sequence[last_e.mGenomeTo -
                                            offset_left:e.mGenomeFrom +
                                            offset_right]

                # count in-frame stop codons
                intron_nstops = sum(
                    1 for x in range(0, len(sequence), 3)
                    if sequence[x:x + 3] in options.stop_codons)
            else:
                intron_nstops = 0

            ## check for splice signals
            sequence = genomic_sequence[last_e.mGenomeTo:e.mGenomeFrom]

            intron_type, prime5, prime3 = Genomics.GetIntronType(sequence)

            if options.loglevel >= 2:
                # up to six exonic bases on either side of the intron; the
                # start is clamped so that it can not wrap around to the
                # end of the sequence
                flank5 = genomic_sequence[max(0, last_e.mGenomeTo - 6):
                                          last_e.mGenomeTo].lower()
                flank3 = genomic_sequence[e.mGenomeFrom:
                                          e.mGenomeFrom + 6].lower()
                context = (flank5 + "|" + sequence[:5] + "..." +
                           sequence[-5:] + "|" + flank3)
                options.stdlog.write("\t".join(
                    map(str, (p.mPredictionId, nintron, lintron,
                              intron_nstops, intron_type, context))) + "\n")

            options.stdout.write("\t".join(
                map(str, (p.mPredictionId, nintron, p.mSbjctToken,
                          p.mSbjctStrand,
                          last_e.mGenomeTo + p.mSbjctGenomeFrom,
                          e.mGenomeFrom + p.mSbjctGenomeFrom, lintron,
                          intron_nstops, intron_type, prime5, prime3))) + "\n")

            last_e = e

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i.\n" %
                             (ninput, noutput))

    E.Stop()