def __verify_boundaries(transcript):
    """
    Check that the transcript start/end coincide with the outer edges of
    its first and last exon. Called from finalise.

    When they disagree, a warning is logged and the transcript coordinates
    are reset to those defined by the exons.

    :param transcript: the transcript to check; it is modified in place.
    :raises InvalidTranscript: if indexing the exons raises an IndexError
        (e.g. no exon is stored).
    :return:
    """
    try:
        exon_start = transcript.exons[0][0]
        if (exon_start == transcript.start
                and transcript.exons[-1][1] == transcript.end):
            # Boundaries already agree with the exons: nothing to do.
            return
        exon_end = transcript.exons[-1][1]
        message = (
            "The transcript {id} has coordinates {tstart}:{tend}, "
            "but its first and last exons define it up until {estart}:{eend}! "
            "Exons: {exons}. Shrinking it"
        ).format(
            id=transcript.id,
            tstart=transcript.start,
            tend=transcript.end,
            estart=exon_start,
            eend=exon_end,
            exons=transcript.exons)
        transcript.logger.warning(message)
        transcript.start = exon_start
        transcript.end = exon_end
    except IndexError as err:
        raise InvalidTranscript(err, transcript.id, str(transcript.exons))
def __check_collisions(transcript, nspan, spans):
    """
    Ensure that the span of a newly created split transcript does not
    overlap any of the spans defined before it.

    :param transcript: the transcript being split; provides ID and logger.
    :param nspan: span of the new transcript.
    :param spans: spans of the previously defined transcripts.
    :raises InvalidTranscript: if ``overlap`` reports a positive overlap
        between ``nspan`` and any previous span.
    :return:
    """
    # An empty collection of spans simply skips the loop.
    for previous in spans:
        shared = overlap(previous, nspan)
        transcript.logger.debug(
            "Comparing start-ends for split of %s. SpanA: %s SpanB: %s Overlap: %d",
            transcript.id, previous, nspan, shared)
        if shared > 0:
            err_message = "Invalid overlap for {0}! T1: {1}. T2: {2}".format(
                transcript.id, previous, nspan)
            transcript.logger.error(err_message)
            raise InvalidTranscript(err_message)
def __calculate_introns(transcript):
    """Private method to create the stores of intron and splice site
    positions.

    Sets ``introns`` and ``splices`` from consecutive exon pairs. When the
    transcript has at least one internal ORF and at least two segments in
    both ``selected_cds`` and ``combined_cds``, it also sets
    ``_selected_cds_introns`` and ``_combined_cds_introns`` with the
    CDS junctions that coincide with an intron.

    :param transcript: the transcript to analyse; it is modified in place.
    :raises InvalidTranscript: if two consecutive exons touch or overlap.
    :return: the same transcript instance.
    """
    exons = transcript.exons
    junctions = []
    # zip() yields nothing for zero or one exon, so no length check is needed.
    for upstream, downstream in zip(exons, exons[1:]):
        if upstream[1] >= downstream[0]:
            exc = InvalidTranscript(
                "Overlapping exons found for\n{0} {1}/{2}\n{3}".format(
                    transcript.id, upstream, downstream, transcript.exons))
            transcript.logger.debug(exc)
            raise exc
        junctions.append((upstream[1] + 1, downstream[0] - 1))

    transcript.introns = set(junctions)
    transcript.splices = {
        position for junction in junctions for position in junction}

    has_spliced_cds = (transcript.number_internal_orfs != 0
                       and len(transcript.selected_cds) >= 2
                       and len(transcript.combined_cds) >= 2)
    if has_spliced_cds:
        # Introns located between segments of the selected CDS
        selected = transcript.selected_cds
        selected_introns = set()
        for first, second in zip(selected[:-1], selected[1:]):
            assert first != second, (transcript.id, transcript.selected_cds)
            first, second = sorted([first, second])
            candidate = (first[1] + 1, second[0] - 1)
            if candidate in transcript.introns:
                selected_introns.add(candidate)
        assert len(selected_introns) > 0
        transcript._selected_cds_introns = selected_introns

        if transcript.number_internal_orfs > 1:
            # Several ORFs: recompute the junctions over the combined CDS
            combined = transcript.combined_cds
            candidates = (
                (former[1] + 1, latter[0] - 1)
                for former, latter in zip(combined, combined[1:]))
            combined_introns = {
                junc for junc in candidates if junc in transcript.introns}
            assert len(combined_introns) > 0
            transcript._combined_cds_introns = combined_introns
        else:
            transcript._combined_cds_introns = (
                transcript._selected_cds_introns.copy())

    assert (len(transcript._combined_cds_introns)
            >= len(transcript._selected_cds_introns))
    return transcript
def __basic_final_checks(transcript):
    """
    Function that verifies minimal criteria of a transcript before finalising.

    If the transcript has no exon, exons are inferred: either as a single
    exon spanning start-end (when ``_possibly_without_exons`` is True) or
    from the CDS segments, extended/complemented by the UTR segments.
    Exons are then normalised to sorted tuples of integers, missing
    start/end coordinates are derived from them, and the strand and
    CDS/UTR consistency are checked.

    :param transcript: the transcript to check; it is modified in place.
    :type transcript: Mikado.loci_objects.transcript.Transcript

    :raises InvalidTranscript: if no exon can be determined, if an exon
        falls outside the transcript boundaries, if a multiexonic
        transcript has no strand (and undefined strands are not accepted),
        or if UTRs are defined without any CDS.
    :return:
    """
    _exons = transcript.exons

    if not _exons:
        if transcript._possibly_without_exons is True:
            # The ID is passed as a logging argument so that the "%s"
            # placeholder is actually filled in.
            transcript.logger.debug(
                "Inferring that %s is a single-exon transcript",
                transcript.id)
            # NOTE(review): the new exon is only seen by the checks below if
            # add_exon mutates the same list object bound to _exons - confirm.
            transcript.add_exon((transcript.start, transcript.end))
        elif len(transcript.combined_cds) == 0:
            exc = InvalidTranscript(
                "No exon defined for the transcript {0}. Aborting".format(
                    transcript.id))
            transcript.logger.exception(exc)
            raise exc
        else:
            # Let us try to derive exons from CDS ...
            _exons = sorted(
                (int(segment[0]), int(segment[1]))
                for segment in transcript.combined_cds)
            if len(transcript.combined_utr) == 0:
                # Enlarge the terminal exons to include the starts
                if transcript.start is not None:
                    _exons[0] = (transcript.start, _exons[0][1])
                if transcript.end is not None:
                    _exons[-1] = (_exons[-1][0], transcript.end)
            else:
                utr = sorted(
                    (int(segment[0]), int(segment[1]))
                    for segment in transcript.combined_utr)
                try:
                    # UTR segments lying before the first CDS segment; the
                    # closest one is merged into it when directly adjacent.
                    upstream = [seg for seg in utr if seg[1] < _exons[0][0]]
                    if upstream and upstream[-1][1] == _exons[0][0] - 1:
                        _exons[0] = (upstream[-1][0], _exons[0][1])
                        upstream.pop()
                    # Same on the other side of the last CDS segment.
                    downstream = [seg for seg in utr
                                  if seg[0] > _exons[-1][1]]
                    if downstream and downstream[0][0] == _exons[-1][1] + 1:
                        _exons[-1] = (_exons[-1][0], downstream[0][1])
                        downstream = downstream[1:]
                    _exons = upstream + _exons + downstream
                except IndexError:
                    exc = InvalidTranscript(
                        "Transcript {} has a mangled CDS/UTR annotation. "
                        "Please revise it.".format(transcript.id))
                    transcript.logger.exception(exc)
                    raise exc

    transcript.logger.debug("Converting to tuples")
    _exons = [(int(exon[0]), int(exon[1])) for exon in _exons]

    # Set the start and end automatically if none has been explicitly provided
    if transcript.start is None:
        transcript.start = min(exon[0] for exon in _exons)
    if transcript.end is None:
        transcript.end = max(exon[1] for exon in _exons)

    # Every element is a tuple by construction (see the conversion above),
    # so only the boundaries have to be validated here.
    for exon in _exons:
        if exon[0] < transcript.start or exon[1] > transcript.end:
            exc = InvalidTranscript(
                "{} for {} is an invalid exon (start {}, end {})".format(
                    exon, transcript.id, transcript.start, transcript.end))
            transcript.logger.debug(exc)
            raise exc

    transcript._set_exons(sorted(_exons))

    if len(transcript.exons) > 1 and transcript.strand is None:
        if transcript._accept_undefined_multi is False:
            exc = InvalidTranscript(
                "Multiexonic transcripts must have a defined strand! Error for {0}"
                .format(transcript.id))
            transcript.logger.exception(exc)
            raise exc
        transcript.strand = "?"

    if transcript.combined_utr != [] and transcript.combined_cds == []:
        exc = InvalidTranscript(
            "Transcript {tid} has defined UTRs but no CDS feature!".format(
                tid=transcript.id))
        transcript.logger.exception(exc)
        raise exc
def split_by_cds(transcript):
    """This method is used for transcripts that have multiple ORFs.
    It will split them according to the CDS information into multiple
    transcripts. UTR information will be retained only if no ORF is
    down/upstream.

    This is a generator: nothing is executed (not even
    ``transcript.finalize()``) until it is iterated. It yields the original
    transcript when no split happens, or the new transcripts otherwise.
    If the split itself fails with InvalidTranscript, the error is logged,
    the transcript is stripped of its CDS and yielded unsplit.

    :param transcript: the transcript instance
    :type transcript: Mikado.loci_objects.transcript.Transcript
    """
    transcript.finalize()

    # List of the transcripts that will be retained
    if transcript.number_internal_orfs < 2:
        # If we only have one ORF this is easy
        new_transcripts = [transcript]
    elif (transcript.json_conf and
          transcript.json_conf.get("pick", None) and
          transcript.json_conf["pick"].get("chimera_split", None) and
          transcript.json_conf["pick"]["chimera_split"].get("skip", None) and
          transcript.source in transcript.json_conf["pick"]["chimera_split"]["skip"]):
        # Disable splitting for transcripts with certain tags
        transcript.logger.warning("%s (label %s) to be skipped for splitting",
                                  transcript.id,
                                  transcript.id.split("_")[0])
        new_transcripts = [transcript]
    else:
        cds_boundaries = SortedDict()
        for orf in sorted(transcript.loaded_bed12,
                          key=operator.attrgetter("thick_start", "thick_end")):
            cds_boundaries[(orf.thick_start, orf.thick_end)] = [orf]

        # Check whether we have to split or not based on BLAST data.
        # The keys are looked up defensively, like in the branch above, so
        # that a partial configuration skips the check instead of raising
        # a KeyError.
        conf = transcript.json_conf
        if conf is not None:
            split_conf = (conf.get("pick") or {}).get("chimera_split") or {}
            if split_conf.get("blast_check") is True:
                cds_boundaries = check_split_by_blast(transcript,
                                                      cds_boundaries)

        if len(cds_boundaries) == 1:
            # Recheck how many boundaries we have - after the BLAST check
            # we might have determined that the transcript has not to be split
            new_transcripts = [transcript]
        else:
            try:
                new_transcripts = __create_splitted_transcripts(
                    transcript, cds_boundaries)
            except InvalidTranscript as err:
                exc = InvalidTranscript(err)
                transcript.logger.error("Error in splitting %s by ORF",
                                        transcript.id)
                transcript.logger.exception(exc)
                transcript.logger.error(
                    "ORFs: %s",
                    "\n".join([str(_) for _ in transcript.internal_orfs]))
                transcript.logger.error(
                    "BED12: %s",
                    "\n".join([str(_) for _ in transcript.loaded_bed12]))
                transcript.logger.error("Stripping %s of its CDS.",
                                        transcript.id)
                transcript.strip_cds()
                new_transcripts = [transcript]

    assert len(new_transcripts) > 0, str(transcript)

    # CDS segments of each ORF of the original transcript: every ORF carried
    # by an output transcript must be one of these.
    original_orfs = {
        tuple(segment[1] for segment in orf if segment[0] == "CDS")
        for orf in transcript.internal_orfs
    }

    for new_transc in new_transcripts:
        for orf in new_transc.internal_orfs:
            cds_segments = tuple(
                segment[1] for segment in orf if segment[0] == "CDS")
            assert cds_segments in original_orfs, (
                transcript.id, original_orfs, cds_segments)

        # Only keep the verified introns that the new transcript contains
        new_transc.verified_introns = set(new_transc.introns).intersection(
            transcript.verified_introns)
        yield new_transc
def __create_splitted_transcripts(transcript, cds_boundaries):
    """
    Private method called by split_by_cds to create the various (N>1)
    transcripts that are its output.

    :param transcript: the transcript to split.
    :param cds_boundaries: a mapping whose keys are the boundaries of the
        new transcripts and whose values are the BED12 objects (ORFs)
        falling inside each boundary.
    :raises InvalidTranscript: if a new transcript retains no CDS, or if
        its span collides with a previously created one.
    :return: the list of the new transcripts, ordered by boundary.
    """
    spans = []
    new_transcripts = []
    total = len(cds_boundaries)
    ordered = sorted(cds_boundaries.items(), key=operator.itemgetter(0))

    for position, (boundary, bed12_objects) in enumerate(ordered):
        new_transcript = transcript.__class__()
        new_transcript.feature = "mRNA"
        for attribute in ("chrom", "source", "score", "strand", "attributes"):
            setattr(new_transcript, attribute, getattr(transcript, attribute))
        new_transcript.parent = transcript.parent

        # Determine which ORFs I have on my right and left
        left = position != 0  # False only for the leftmost boundary
        right = position + 1 != total  # False only for the rightmost boundary

        counter = position + 1  # Split numbering starts from 1
        new_transcript.id = "{0}.split{1}".format(transcript.id, counter)
        new_transcript.logger = transcript.logger

        bed12_strand = set(_.strand for _ in bed12_objects)
        assert len(bed12_strand) == 1
        bed12_strand = bed12_strand.pop()

        transcript.logger.debug("Splitting exons for %s", new_transcript.id)
        my_exons, discarded_exons, tstart, tend = __create_splitted_exons(
            transcript, boundary, left, right, bed12_strand)

        transcript.logger.debug(
            """TID %s counter %d, boundary %s, left %s right %s""",
            transcript.id, counter, boundary, left, right)

        if right is True:
            transcript.logger.debug("TID %s TEND %d Boun[1] %s",
                                    transcript.id, tend, boundary[1])
        if left is True:
            transcript.logger.debug("TID %s TSTART %d Boun[0] %s",
                                    transcript.id, tstart, boundary[0])

        assert len(my_exons) > 0, (discarded_exons, boundary)

        new_transcript.exons = my_exons
        new_transcript.start = min(exon[0] for exon in new_transcript.exons)
        new_transcript.end = max(exon[1] for exon in new_transcript.exons)
        assert new_transcript.end <= transcript.end
        assert new_transcript.start >= transcript.start
        assert new_transcript.is_coding is False
        new_transcript.json_conf = transcript.json_conf

        # Now we have to modify the BED12s to reflect
        # the fact that we are starting/ending earlier
        new_transcript.finalize()

        # NOTE(review): tstart/tend are logged with %d here unconditionally,
        # while above they are only logged when left/right is True - confirm
        # they are always integers at this point.
        # The message used to contain a stray backslash (invalid escape).
        transcript.logger.debug(
            "Relocating %d ORFs into the new transcript (%d, %d), "
            "tcoordinates (%d, %d)",
            len(bed12_objects),
            new_transcript.start, new_transcript.end,
            tstart, tend)
        new_bed12s = __relocate_orfs(transcript, bed12_objects, tstart, tend)
        assert len([_ for _ in new_bed12s if _.strand == "+"]) > 0

        transcript.logger.debug(
            "Loading %d ORFs into the new transcript (%d, %d): %s",
            len(new_bed12s),
            new_transcript.start, new_transcript.end,
            "\n\t" + "\n".join([str(_) for _ in new_bed12s]))
        new_transcript.logger = transcript.logger
        new_transcript.load_orfs(new_bed12s)

        if new_transcript.selected_cds_length <= 0:
            err_message = "No CDS information retained for {0} split {1}\n".format(
                transcript.id, counter)
            err_message += "BED: {0}".format("\n\t".join(
                [str(x) for x in new_bed12s]))
            raise InvalidTranscript(err_message)

        # Load the blast hits
        __load_blast_hits(new_transcript, boundary, transcript)
        new_transcript.finalize()
        new_transcripts.append(new_transcript)

        nspan = (new_transcript.start, new_transcript.end)
        transcript.logger.debug(
            "Transcript {0} split {1}, discarded exons: {2}".format(
                transcript.id, counter, discarded_exons))
        # The new span must not overlap any of the spans created so far
        __check_collisions(transcript, nspan, spans)
        spans.append([new_transcript.start, new_transcript.end])

    return new_transcripts