示例#1
0
def __verify_boundaries(transcript):
    """
    Make sure the transcript coordinates coincide with its outermost exons.

    The start of the first exon and the end of the last exon are compared with
    ``transcript.start`` and ``transcript.end``. On a mismatch a warning is
    logged and the transcript coordinates are overwritten with the values
    taken from the exons.

    :param transcript: the transcript to check; it is modified in place.
    :return: None
    :raises InvalidTranscript: if indexing the exon store raises IndexError
        (for instance because no exon is present).
    """

    try:
        first_exon_start = transcript.exons[0][0]
        last_exon_end = transcript.exons[-1][1]
    except IndexError as err:
        raise InvalidTranscript(err, transcript.id, str(transcript.exons))

    mismatch = (first_exon_start != transcript.start
                or last_exon_end != transcript.end)
    if not mismatch:
        return

    # The continuation lines of this literal are part of the logged message:
    # their leading whitespace must not be re-indented.
    message = """The transcript {id} has coordinates {tstart}:{tend},
            but its first and last exons define it up until {estart}:{eend}!
            Exons: {exons}. Shrinking it"""
    transcript.logger.warning(
        message.format(
            id=transcript.id,
            tstart=transcript.start,
            tend=transcript.end,
            estart=first_exon_start,
            eend=last_exon_end,
            exons=transcript.exons))

    transcript.start = first_exon_start
    transcript.end = last_exon_end
示例#2
0
def __check_collisions(transcript, nspan, spans):
    """
    Ensure that a newly created span does not overlap any earlier one.

    Every element of ``spans`` is compared against ``nspan`` through
    ``overlap``; each comparison is logged at debug level. Nothing happens
    when ``spans`` is empty.

    :param transcript: the transcript being split; only its ``id`` and
        ``logger`` are used here.
    :param nspan: the span of the newly created transcript.
    :param spans: the spans of the transcripts created so far.
    :return: None
    :raises InvalidTranscript: as soon as one comparison reports an overlap
        greater than zero.
    """

    for previous in spans:
        shared = overlap(previous, nspan)

        transcript.logger.debug(
            "Comparing start-ends for split of %s. SpanA: %s SpanB: %s Overlap: %d",
            transcript.id, previous, nspan, shared)

        if shared > 0:
            err_message = "Invalid overlap for {0}! T1: {1}. T2: {2}".format(
                transcript.id, previous, nspan)
            transcript.logger.error(err_message)
            raise InvalidTranscript(err_message)
示例#3
0
def __calculate_introns(transcript):
    """Populate the intron and splice-site stores of a transcript.

    Each pair of consecutive exons defines one intron
    ``(end of first exon + 1, start of second exon - 1)``; both coordinates
    are also recorded as splice positions. The results are stored as sets in
    ``transcript.introns`` and ``transcript.splices``.

    When the transcript has at least one internal ORF and both
    ``selected_cds`` and ``combined_cds`` hold two or more segments, the
    junctions between consecutive CDS segments that are also transcript
    introns are stored in ``_selected_cds_introns`` and
    ``_combined_cds_introns``. Otherwise those two attributes are left as
    they are.

    :param transcript: the transcript to update in place.
    :return: the same transcript instance.
    :raises InvalidTranscript: if two consecutive exons touch or overlap.
    :raises AssertionError: if one of the internal consistency checks fails.
    """

    exons = transcript.exons
    found_introns = set()
    found_splices = set()

    for upstream, downstream in zip(exons[:-1], exons[1:]):
        if upstream[1] >= downstream[0]:
            exc = InvalidTranscript(
                "Overlapping exons found for\n{0} {1}/{2}\n{3}".format(
                    transcript.id, upstream, downstream, transcript.exons))
            transcript.logger.debug(exc)
            raise exc
        gap = (upstream[1] + 1, downstream[0] - 1)
        found_introns.add(gap)      # the junction as a whole
        found_splices.update(gap)   # its two boundary positions

    transcript.introns = found_introns
    transcript.splices = found_splices

    has_multi_segment_cds = (transcript.number_internal_orfs != 0
                             and len(transcript.selected_cds) >= 2
                             and len(transcript.combined_cds) >= 2)

    if has_multi_segment_cds:
        # Junctions of the selected ORF
        selected = transcript.selected_cds
        selected_junctions = set()
        for first, second in zip(selected[:-1], selected[1:]):
            assert first != second, (transcript.id, transcript.selected_cds)
            first, second = sorted([first, second])
            junction = (first[1] + 1, second[0] - 1)
            # Gaps between CDS segments that are not real introns are ignored
            if junction in transcript.introns:
                selected_junctions.add(junction)

        assert len(selected_junctions) > 0
        transcript._selected_cds_introns = selected_junctions

        if transcript.number_internal_orfs > 1:
            # Several ORFs: recompute the junctions over the combined CDS
            combined = transcript.combined_cds
            candidates = ((former[1] + 1, latter[0] - 1)
                          for former, latter in zip(combined[:-1],
                                                    combined[1:]))
            combined_junctions = {junction for junction in candidates
                                  if junction in transcript.introns}
            assert len(combined_junctions) > 0
            transcript._combined_cds_introns = combined_junctions
        else:
            # Single ORF: combined and selected CDS introns coincide
            transcript._combined_cds_introns = (
                transcript._selected_cds_introns.copy())

    assert len(transcript._combined_cds_introns) >= len(
        transcript._selected_cds_introns)
    return transcript
示例#4
0
def _exons_from_cds_and_utr(transcript):
    """
    Rebuild a sorted exon list from the CDS and UTR segments of a transcript.

    :param transcript: a transcript with a non-empty ``combined_cds``.
    :return: a sorted list of ``(start, end)`` integer tuples.
    :raises InvalidTranscript: if an IndexError is hit while merging the UTR
        segments with the terminal CDS segments.
    """

    exons = sorted(
        (int(segment[0]), int(segment[1]))
        for segment in transcript.combined_cds)

    if len(transcript.combined_utr) == 0:
        # No UTR annotated: stretch the terminal CDS segments up to the
        # transcript boundaries, when these are known.
        if transcript.start is not None:
            exons[0] = (transcript.start, exons[0][1])
        if transcript.end is not None:
            exons[-1] = (exons[-1][0], transcript.end)
        return exons

    utr = sorted(
        (int(segment[0]), int(segment[1]))
        for segment in transcript.combined_utr)
    try:
        before = [segment for segment in utr if segment[1] < exons[0][0]]
        if before and before[-1][1] == exons[0][0] - 1:
            # The last upstream UTR segment is contiguous with the first CDS
            # segment: together they form a single exon.
            exons[0] = (before[-1][0], exons[0][1])
            before.pop()
        after = [segment for segment in utr if segment[0] > exons[-1][1]]
        if after and after[0][0] == exons[-1][1] + 1:
            # Same on the other side, with the first downstream UTR segment.
            exons[-1] = (exons[-1][0], after[0][1])
            after = after[1:]
        return before + exons + after
    except IndexError as err:
        exc = InvalidTranscript(
            "Transcript {} has a mangled CDS/UTR annotation. Please revise it."
            .format(transcript.id))
        transcript.logger.exception(exc)
        raise exc from err


def __basic_final_checks(transcript):
    """
    Function that verifies minimal criteria of a transcript before finalising.

    Steps, in order:

    * if the transcript has no exons, either infer a single exon spanning
      ``start``-``end`` (when ``_possibly_without_exons`` is True) or derive
      the exons from the CDS/UTR segments; fail if there is no CDS either;
    * convert every exon to a tuple of two integers;
    * set ``start``/``end`` from the exons when they are None;
    * check that every exon lies within ``start``-``end`` and store the sorted
      exons through ``_set_exons``;
    * require a strand for multiexonic transcripts, or set it to ``"?"`` when
      ``_accept_undefined_multi`` is not False;
    * reject transcripts that have UTR segments but no CDS.

    :param transcript: the transcript to check; it is modified in place.
    :type transcript: Mikado.loci_objects.transcript.Transcript

    :return: None
    :raises InvalidTranscript: if any of the checks above fails.
    """

    _exons = transcript.exons

    if not _exons:
        if transcript._possibly_without_exons is True:
            transcript.logger.debug(
                "Inferring that %s is a single-exon transcript",
                transcript.id)
            transcript.add_exon((transcript.start, transcript.end))
            # Re-read the store: if ``exons`` hands out a copy, the reference
            # taken above would still be empty and the inferred exon would be
            # dropped by ``_set_exons`` below.
            _exons = transcript.exons
        elif len(transcript.combined_cds) == 0:
            exc = InvalidTranscript(
                "No exon defined for the transcript {0}. Aborting".format(
                    transcript.id))
            transcript.logger.exception(exc)
            raise exc
        else:
            # Let us try to derive exons from CDS ...
            _exons = _exons_from_cds_and_utr(transcript)

    transcript.logger.debug("Converting to tuples")
    _exons = [(int(exon[0]), int(exon[1])) for exon in _exons]

    # Set the start and end automatically if none has been explicitly provided
    if transcript.start is None:
        transcript.start = min(exon[0] for exon in _exons)
    if transcript.end is None:
        transcript.end = max(exon[1] for exon in _exons)

    # Every exon is a tuple of two ints at this point, so only the
    # coordinates have to be validated.
    for exon in _exons:
        if exon[0] < transcript.start or exon[1] > transcript.end:
            exc = InvalidTranscript(
                "{} for {} is an invalid exon (start {}, end {})".format(
                    exon, transcript.id, transcript.start, transcript.end))
            transcript.logger.debug(exc)
            raise exc

    transcript._set_exons(sorted(_exons))

    if len(transcript.exons) > 1 and transcript.strand is None:
        if transcript._accept_undefined_multi is False:
            exc = InvalidTranscript(
                "Multiexonic transcripts must have a defined strand! Error for {0}"
                .format(transcript.id))
            transcript.logger.exception(exc)
            raise exc
        else:
            transcript.strand = "?"

    if transcript.combined_utr != [] and transcript.combined_cds == []:
        exc = InvalidTranscript(
            "Transcript {tid} has defined UTRs but no CDS feature!".format(
                tid=transcript.id))
        transcript.logger.exception(exc)
        raise exc
示例#5
0
def split_by_cds(transcript):
    """This method is used for transcripts that have multiple ORFs.
    It will split them according to the CDS information into multiple transcripts.
    UTR information will be retained only if no ORF is down/upstream.

    This is a generator: nothing (including ``transcript.finalize()``) runs
    until it is iterated. The original transcript is yielded unchanged when:

    * it has fewer than two internal ORFs;
    * its ``source`` is listed under ``pick/chimera_split/skip`` in
      ``json_conf``;
    * only one CDS boundary is left after the optional BLAST check.

    If the split itself raises ``InvalidTranscript``, the error is logged, the
    CDS is stripped from the transcript and the stripped transcript is yielded.

    A missing or partial ``json_conf`` is tolerated: absent keys simply mean
    "no skip list" and "no BLAST check".

    :param transcript: the transcript instance
    :type transcript: Mikado.loci_objects.transcript.Transcript

    :return: yields the resulting transcript(s), each with
        ``verified_introns`` restricted to its own introns.
    :raises AssertionError: if a yielded transcript carries an ORF whose CDS
        segments do not match any ORF of the original transcript.
    """

    transcript.finalize()

    if transcript.number_internal_orfs < 2:
        # If we only have one ORF this is easy
        new_transcripts = [transcript]
    else:
        # Resolve the "pick/chimera_split" section once, so that the skip list
        # and the BLAST switch are read with the same tolerance for missing
        # configuration.
        pick_conf = (transcript.json_conf or {}).get("pick") or {}
        split_conf = pick_conf.get("chimera_split") or {}
        skip_sources = split_conf.get("skip")

        if skip_sources and transcript.source in skip_sources:
            # Disable splitting for transcripts with certain tags
            transcript.logger.warning(
                "%s (label %s) to be skipped for splitting",
                transcript.id,
                transcript.id.split("_")[0])
            new_transcripts = [transcript]
        else:
            cds_boundaries = SortedDict()
            for orf in sorted(transcript.loaded_bed12,
                              key=operator.attrgetter("thick_start",
                                                      "thick_end")):
                cds_boundaries[(orf.thick_start, orf.thick_end)] = [orf]

            # Check whether we have to split or not based on BLAST data
            if split_conf.get("blast_check") is True:
                cds_boundaries = check_split_by_blast(transcript,
                                                      cds_boundaries)

            if len(cds_boundaries) == 1:
                # Recheck how many boundaries we have - after the BLAST check
                # we might have determined that the transcript has not to be
                # split
                new_transcripts = [transcript]
            else:
                try:
                    new_transcripts = __create_splitted_transcripts(
                        transcript, cds_boundaries)
                except InvalidTranscript as err:
                    exc = InvalidTranscript(err)
                    transcript.logger.error("Error in splitting %s by ORF",
                                            transcript.id)
                    transcript.logger.exception(exc)
                    transcript.logger.error(
                        "ORFs: %s",
                        "\n".join([str(_) for _ in transcript.internal_orfs]))
                    transcript.logger.error(
                        "BED12: %s",
                        "\n".join([str(_) for _ in transcript.loaded_bed12]))
                    transcript.logger.error("Stripping %s of its CDS.",
                                            transcript.id)
                    transcript.strip_cds()
                    new_transcripts = [transcript]

    assert len(new_transcripts) > 0, str(transcript)

    # CDS segments of every ORF of the original transcript; read after any
    # ``strip_cds`` call above, exactly as the checks below require.
    __original = {
        tuple(segment[1] for segment in internal if segment[0] == "CDS")
        for internal in transcript.internal_orfs
    }

    for new_transc in new_transcripts:
        for internal in new_transc.internal_orfs:
            internal = tuple(
                segment[1] for segment in internal if segment[0] == "CDS")
            assert internal in __original, (transcript.id, __original,
                                            internal)

        # Keep only the verified introns that belong to this transcript
        new_transc.verified_introns = set(new_transc.introns).intersection(
            transcript.verified_introns)
        yield new_transc
示例#6
0
def __create_splitted_transcripts(transcript, cds_boundaries):
    """
    Private method called by split_by_cds to create the various (N>1) transcripts
    that are its output.

    One new transcript is built per entry of ``cds_boundaries``, processed in
    ascending key order. Each new transcript receives the exons returned by
    ``__create_splitted_exons`` for its boundary, is finalized, is loaded with
    the ORFs returned by ``__relocate_orfs`` and with the BLAST hits, and is
    finally checked against the spans of the transcripts created before it.

    :param transcript: the transcript to split. Its class is instantiated
     (without arguments) to create each new transcript.
    :param cds_boundaries: a mapping (``.items()`` is called on it) whose keys
     are the boundaries of the new transcripts and whose values are
     collections of BED12 objects; all the objects of one entry must share
     the same ``strand``.
    :return: the list of the new transcripts, in boundary order.
    :raises InvalidTranscript: if a new transcript retains no CDS
     (``selected_cds_length <= 0``) after loading the ORFs; the collision
     check at the end of each iteration raises it as well.
    :raises AssertionError: if one of the internal sanity checks fails.
    """

    # Spans (start, end) of the transcripts created so far
    spans = []
    new_transcripts = []

    for counter, (boundary, bed12_objects) in enumerate(
            sorted(cds_boundaries.items(), key=operator.itemgetter(0))):
        new_transcript = transcript.__class__()
        new_transcript.feature = "mRNA"
        # Copy the basic annotation over from the parent transcript
        for attribute in ["chrom", "source", "score", "strand", "attributes"]:
            setattr(new_transcript, attribute, getattr(transcript, attribute))
        # Determine which ORFs I have on my right and left
        new_transcript.parent = transcript.parent
        # left/right are True when another boundary precedes/follows this one
        left = True
        right = True
        if counter == 0:  # leftmost
            left = False
        if 1 + counter == len(cds_boundaries):  # rightmost
            right = False
        # From here on the counter is 1-based; it is used as the split suffix
        counter += 1  # Otherwise they start from 0
        new_transcript.id = "{0}.split{1}".format(transcript.id, counter)
        new_transcript.logger = transcript.logger
        # All the ORFs of one boundary must lie on the same strand
        bed12_strand = set(_.strand for _ in bed12_objects)
        assert len(bed12_strand) == 1
        bed12_strand = bed12_strand.pop()

        transcript.logger.debug("Splitting exons for %s", new_transcript.id)
        my_exons, discarded_exons, tstart, tend = __create_splitted_exons(
            transcript, boundary, left, right, bed12_strand)

        transcript.logger.debug(
            """TID %s counter %d, boundary %s, left %s right %s""",
            transcript.id, counter, boundary, left, right)

        if right is True:
            transcript.logger.debug("TID %s TEND %d Boun[1] %s", transcript.id,
                                    tend, boundary[1])
        if left is True:
            transcript.logger.debug("TID %s TSTART %d Boun[0] %s",
                                    transcript.id, tstart, boundary[0])

        assert len(my_exons) > 0, (discarded_exons, boundary)

        new_transcript.exons = my_exons

        # The new transcript spans its own exons and must stay within the
        # coordinates of the original transcript
        new_transcript.start = min(exon[0] for exon in new_transcript.exons)
        new_transcript.end = max(exon[1] for exon in new_transcript.exons)
        assert new_transcript.end <= transcript.end
        assert new_transcript.start >= transcript.start
        # No ORF has been loaded into the new transcript at this point
        # (load_orfs is only called further down)
        assert new_transcript.is_coding is False
        new_transcript.json_conf = transcript.json_conf
        # Now we have to modify the BED12s to reflect
        # the fact that we are starting/ending earlier
        new_transcript.finalize()
        # if transcript.monoexonic is True:
        # if new_transcript.monoexonic is True:
        #     new_transcript.strand = None

        transcript.logger.debug(
            "Relocating %d ORFs into the new transcript (%d, %d), \
tcoordinates (%d, %d)", len(bed12_objects), new_transcript.start,
            new_transcript.end, tstart, tend)
        new_bed12s = __relocate_orfs(transcript, bed12_objects, tstart, tend)
        # At least one of the relocated ORFs must be on the "+" strand
        assert len([_ for _ in new_bed12s if _.strand == "+"]) > 0
        transcript.logger.debug(
            "Loading %d ORFs into the new transcript (%d, %d): %s",
            len(new_bed12s), new_transcript.start, new_transcript.end,
            "\n\t" + "\n".join([str(_) for _ in new_bed12s]))
        new_transcript.logger = transcript.logger
        new_transcript.load_orfs(new_bed12s)

        # Loading the ORFs must have produced some CDS, otherwise the split
        # is invalid
        if new_transcript.selected_cds_length <= 0:
            err_message = "No CDS information retained for {0} split {1}\n".format(
                transcript.id, counter)
            err_message += "BED: {0}".format("\n\t".join(
                [str(x) for x in new_bed12s]))
            raise InvalidTranscript(err_message)

        # Load the blast hits
        __load_blast_hits(new_transcript, boundary, transcript)
        new_transcript.finalize()
        new_transcripts.append(new_transcript)
        nspan = (new_transcript.start, new_transcript.end)
        transcript.logger.debug(
            "Transcript {0} split {1}, discarded exons: {2}".format(
                transcript.id, counter, discarded_exons))
        # Compare the new span with all the previous ones before recording it
        __check_collisions(transcript, nspan, spans)
        spans.append([new_transcript.start, new_transcript.end])

    return new_transcripts