Пример #1
0
    def test_load_pos_and_neg(self):
        """Load a plus-strand and a minus-strand ORF on the same transcript.

        Both BED12 entries must be valid and must both be returned, in order,
        by ``retrieval.find_overlapping_cds``. After ``retrieval.load_orfs``
        the transcript is expected to carry a single internal ORF whose
        combined CDS starts at 201 and is 90 bases long.
        """

        plus_orf = BED12(transcriptomic=True)
        plus_orf.chrom = self.tr.id
        plus_orf.start = 0
        plus_orf.end = self.tr.cdna_length - 1
        plus_orf.strand = "+"
        plus_orf.name = "first"
        plus_orf.thick_start = 101
        plus_orf.thick_end = 190
        self.assertFalse(plus_orf.invalid)

        # The second ORF is a copy of the first, moved to the minus strand.
        minus_orf = plus_orf.copy()
        minus_orf.strand = "-"
        minus_orf.thick_start = 1
        minus_orf.thick_end = 87
        minus_orf.name = "second"
        self.assertFalse(minus_orf.invalid)

        # assertLogs also verifies that at least one record is emitted.
        with self.assertLogs("null", "DEBUG"):
            retained = retrieval.find_overlapping_cds(
                self.tr, [plus_orf, minus_orf])

        self.assertEqual(len(retained), 2,
                         self.tr.json_conf["pick"]["orf_loading"])
        self.assertEqual(retained, [plus_orf, minus_orf],
                         [orf.name for orf in retained])

        retrieval.load_orfs(self.tr, [plus_orf, minus_orf])
        self.assertEqual(self.tr.number_internal_orfs, 1)
        self.assertEqual(self.tr.combined_cds_start, 201,
                         self.tr.combined_cds_start)
        self.assertEqual(self.tr.combined_cds_length, 90)
Пример #2
0
def as_bed12(transcript):
    """
    Build a genomic (non-transcriptomic) BED12 object from a transcript.

    The thick (CDS) interval spans from the start of the first combined CDS
    segment to the end of the last one for coding transcripts; for non-coding
    transcripts both thick coordinates are set to the BED start.
    Block starts are accumulated from the block sizes and the sorted introns.

    :param transcript: Mikado.loci.transcript.Transcript
    :return: the populated BED12 object
    """

    record = BED12()
    record.transcriptomic = False
    record.header = False
    record.chrom = transcript.chrom
    record.start = transcript.start
    record.end = transcript.end
    record.name = transcript.id
    record.score = transcript.score
    record.strand = transcript.strand

    if not transcript.is_coding:
        record.thick_start = record.thick_end = record.start
    else:
        record.thick_start = transcript.combined_cds[0][0]
        record.thick_end = transcript.combined_cds[-1][1]

    record.block_count = transcript.exon_num
    record.block_sizes = [
        segment[1] - segment[0] + 1 for segment in transcript.exons
    ]
    record.block_starts = [0]
    for index, gap in enumerate(sorted(transcript.introns)):
        # Next block start = previous start + previous size + intron length.
        gap_length = gap[1] - gap[0] + 1
        previous_start = record.block_starts[index]
        previous_size = record.block_sizes[index]
        record.block_starts.append(previous_start + previous_size + gap_length)
    return record
Пример #3
0
    def test_load_invalid_length(self):
        """Loading an ORF longer than the transcript cDNA must log a warning.

        The BED12 end is set 10 bases past the cDNA length; ``load_orfs`` is
        then expected to emit a "Wrong ORF for <transcript id>:" message at
        WARNING level (or above) on the "null" logger.
        """

        too_long = BED12(transcriptomic=True)
        too_long.chrom = self.tr.id
        self.assertTrue(too_long.transcriptomic)
        too_long.start = 0
        too_long.strand = "+"
        too_long.end = self.tr.cdna_length + 10
        too_long.thick_start = 101
        too_long.thick_end = 190
        # No name was assigned: the BED id is expected to equal its chrom.
        self.assertEqual(too_long.chrom,
                         too_long.id,
                         too_long.id)

        with self.assertLogs("null", "WARNING") as captured:
            retrieval.load_orfs(self.tr, [too_long])

        expected = "Wrong ORF for {}:".format(self.tr.id)
        self.assertTrue(
            any(expected in message for message in captured.output),
            captured.output)
Пример #4
0
    def test_filter_non_transcriptomic(self):
        """Non-transcriptomic BED12 entries must be dropped by the CDS filter.

        Two otherwise identical ORFs are submitted; only the one flagged as
        transcriptomic is expected back from ``find_overlapping_cds``.
        """

        kept = BED12(transcriptomic=True)
        kept.chrom = self.tr.id
        kept.name = "valid"
        kept.start = 0
        kept.end = self.tr.cdna_length - 1
        kept.strand = "+"
        kept.thick_start = 101
        kept.thick_end = 190

        dropped = kept.copy()
        dropped.name = "non-transcriptomic"
        dropped.transcriptomic = False

        result = retrieval.find_overlapping_cds(self.tr, [dropped, kept])
        self.assertEqual(result, [kept])
Пример #5
0
    def prepare_info(self, transcript):
        """Collect the cDNA, transcriptomic BED12 and peptide of a transcript.

        The cDNA is read (upper-cased) from ``self.fai``; the BED12 line is
        read from ``self.bedfile`` at the offset stored for this transcript
        and converted to transcriptomic coordinates.

        :param transcript: key used to look up both the sequence index and
            the stored BED file position; it must equal the BED12 name.
        :return: tuple ``(cdna, transcriptomic_bed, peptide)``
        :raises AssertionError: if the BED12 name does not match, if the
            coding status is lost in the conversion, or if the phase changes.
        """

        cdna = str(self.fai[transcript]).upper()

        offset = self.__found_in_bed[transcript]
        self.bedfile.seek(offset)
        raw_line = self.bedfile.readline()
        genomic_bed = BED12(raw_line)
        assert genomic_bed.name == transcript, (genomic_bed.name, transcript,
                                                offset)

        tbed = genomic_bed.to_transcriptomic(sequence=cdna, lenient=True)

        if tbed.coding is False and genomic_bed.coding is True:
            message = "The transcriptomic BED has been transformed incorrectly. Reason: {}"
            raise AssertionError(message.format(tbed.invalid_reason))

        if genomic_bed.phase and tbed.phase != genomic_bed.phase:
            message = "The transcriptomic BED has been transformed incorrectly. Phases: {}, {}, {}"
            raise AssertionError(
                message.format(tbed.phase, genomic_bed.phase, raw_line))

        # Translation starts at the later of the phase offset and the
        # 0-based thick start, and runs up to the thick end.
        cds_from = max(tbed.phase, tbed.thick_start - 1)
        cds_to = tbed.thick_end
        coding_sequence = Seq.Seq(str(cdna[cds_from:cds_to]))
        pep = str(coding_sequence.translate())

        return cdna, tbed, pep
Пример #6
0
    def test_load_invalid_multiple(self):
        """Only the valid ORF is loaded when it is paired with an invalid one.

        The second BED12 is a copy of the valid one with a different thick
        interval and phase 0, which must make it invalid; after ``load_orfs``
        the transcript is expected to hold exactly one internal ORF.
        """

        good_orf = BED12(transcriptomic=True)
        good_orf.chrom = self.tr.id
        good_orf.name = "valid"
        good_orf.start = 0
        good_orf.end = self.tr.cdna_length - 1
        good_orf.strand = "+"
        good_orf.thick_start = 101
        good_orf.thick_end = 190

        bad_orf = good_orf.copy()
        bad_orf.name = "invalid"
        bad_orf.thick_start = 1
        bad_orf.thick_end = 89
        bad_orf.phase = 0

        self.assertTrue(bad_orf.invalid)
        self.assertFalse(good_orf.invalid, good_orf.invalid_reason)

        # assertLogs also requires at least one record on the "null" logger.
        with self.assertLogs("null", "DEBUG"):
            retrieval.load_orfs(self.tr, [good_orf, bad_orf])

        self.assertEqual(self.tr.number_internal_orfs, 1)
Пример #7
0
def as_bed12(transcript, transcriptomic=False):
    """
    Build a BED12 object describing a transcript.

    The transcript is finalized first. The BED name encodes the transcript
    ID, its coding status and, for coding transcripts, a phase; an alias
    differing from the ID is appended when present.

    :param transcript: Mikado.loci.transcript.Transcript
    :param transcriptomic: if true, the genomic BED12 is converted with
        ``to_transcriptomic`` and its chrom is replaced by the transcript ID.
    :return: the populated BED12 object
    :raises KeyError: if the relevant CDS segment has no entry in
        ``transcript.phases``
    """

    transcript.finalize()
    record = BED12(table=transcript.codon_table)
    record.transcriptomic = False
    record.header = False
    record.chrom = transcript.chrom
    record.start = transcript.start
    record.end = transcript.end

    if transcript.is_coding is True:
        # The phase reported in the name is looked up on the first selected
        # CDS segment, or on the last one for minus-strand transcripts.
        cds_index = -1 if transcript.strand == "-" else 0
        try:
            phase = transcript.phases[transcript.selected_cds[cds_index]]
        except KeyError:
            raise KeyError(
                (transcript.selected_cds[cds_index], transcript.phases))

        label = "ID={ID};coding={coding};phase={phase}".format(
            ID=transcript.id,
            coding=transcript.is_coding,
            phase=phase)
    else:
        label = "ID={ID};coding={coding}".format(
            ID=transcript.id,
            coding=transcript.is_coding,
        )

    if not (transcript.alias is None or transcript.alias == transcript.id):
        label = label + ";alias={}".format(transcript.alias)

    record.name = label
    record.score = transcript.score or 0
    record.strand = transcript.strand

    if not transcript.is_coding:
        record.thick_start = record.thick_end = record.start
    else:
        record.coding = True
        # Exactly one selected CDS segment must contain the CDS start.
        start_segments = [
            segment for segment in transcript.selected_cds
            if transcript.selected_cds_start in segment
        ]
        assert len(start_segments) == 1
        record.phase = transcript.phases[start_segments[-1]]
        record.thick_start = transcript.selected_cds[0][0]
        record.thick_end = transcript.selected_cds[-1][1]

    record.block_count = transcript.exon_num
    record.block_sizes = [
        segment[1] - segment[0] + 1 for segment in transcript.exons
    ]
    record.block_starts = [0]
    for index, gap in enumerate(sorted(transcript.introns)):
        # Next block start = previous start + previous size + intron length.
        gap_length = gap[1] - gap[0] + 1
        previous_start = record.block_starts[index]
        previous_size = record.block_sizes[index]
        record.block_starts.append(previous_start + previous_size + gap_length)

    if not transcriptomic:
        return record

    record = record.to_transcriptomic(alias=transcript.alias,
                                      start_adjustment=False,
                                      coding=transcript.is_coding)
    record.chrom = transcript.id
    return record
Пример #8
0
def transfer_cds(transcript: Transcript,
                 ref_cdna: str,
                 ref_bed: BED12,
                 target_cdna: str,
                 target_bed: BED12,
                 logger=create_null_logger()):
    """
    Transfer the CDS of a reference transcript onto a target transcript.

    If the reference BED is non-coding, the CDS is stripped from the
    transcript and ``target_bed.coding`` is set to False. Otherwise the
    reference peptide is translated from ``ref_cdna`` and handed to
    ``transfer_by_alignment`` together with the target cDNA and BED; the
    transcript CDS is reloaded from the resulting BED whenever the thick
    interval changed. The attributes "aligner_cds", "original_cds" and (for
    the non-coding case) "was_coding" are set on ``transcript.attributes``.

    :param transcript: the target transcript; if None, nothing is done.
    :param ref_cdna: cDNA sequence of the reference transcript.
    :param ref_bed: BED12 of the reference (its thick interval is used to
        slice ``ref_cdna``).
    :param target_cdna: cDNA sequence of the target transcript.
    :param target_bed: BED12 of the target; must be transcriptomic.
    :param logger: logger used for debug and warning messages.
    :return: tuple ``(transcript, target_bed, pep_coords)`` where
        ``pep_coords`` is a 3-tuple whose last item is a boolean flag.
    :raises AssertionError: if ``target_bed`` is not transcriptomic, or if
        reference and target cDNAs are identical but the transcript ends up
        non-coding or with a CDS different from the translated original.
    """

    if transcript is None:
        return transcript, target_bed, (None, None, False)

    transcript.finalize()
    assert target_bed.transcriptomic is True

    logger.debug("Starting with %s, phases: %s (BED %s)", transcript.id,
                 transcript.phases, target_bed.phase)

    if ref_bed.coding is False:
        # Only one placeholder in the message: passing a second argument
        # would make the logging module fail to format the record.
        logger.debug("%s is non coding, returning immediately.", transcript.id)
        transcript.attributes["aligner_cds"] = False
        transcript.attributes["was_coding"] = transcript.is_coding
        target_bed.coding = False
        transcript.strip_cds()
        return transcript, target_bed, (None, None, True)

    original_start, original_end = target_bed.thick_start, target_bed.thick_end
    ref_pep = str(
        Seq.Seq(str(
            ref_cdna[ref_bed.thick_start -
                     1:ref_bed.thick_end])).translate(to_stop=False))

    first_stop = ref_pep.find("*")
    if first_stop == -1:
        pass
    elif abs(first_stop * 3 - ref_bed.cds_len) in (0, 3):
        # This is the "good" case: the CDS is correct.
        ref_pep = ref_pep[:first_stop]
    else:
        logger.warning(
            "The sequence of %s has in frame stop codons. Adjusting the program to take this into account.",
            ref_bed.name)

    logger.debug("%s now has phases: %s (%s)", transcript.id,
                 transcript.phases, target_bed.phase)
    target_bed, pep_coords = transfer_by_alignment(ref_pep,
                                                   target_cdna,
                                                   target_bed,
                                                   logger=logger)
    logger.debug("%s now has phases: %s; target bed: %s", transcript.id,
                 transcript.phases, target_bed.phase)
    # The flag is true when the coordinates span the whole reference peptide.
    pep_coords = (pep_coords[0], pep_coords[1],
                  (pep_coords[0] == 1 and pep_coords[1] == len(ref_pep)))

    if target_bed.thick_start == original_start and target_bed.thick_end == original_end:
        transcript.attributes["aligner_cds"] = True
        logger.debug("%s now has phases: %s", transcript.id,
                     transcript.phases)
    else:
        # The thick interval moved: reload the CDS from the new BED.
        transcript.attributes["aligner_cds"] = False
        transcript.strip_cds()
        if target_bed.coding is True:
            transcript.load_orfs([target_bed])

    logger.debug("%s now has phases: %s", transcript.id, transcript.phases)
    # Now we have to decide whether the transcript has the "original" CDS or not
    _, cigar = transfer.get_and_prepare_cigar(str(ref_cdna),
                                              str(target_cdna))
    ref_array, target_array = transfer.create_translation_array(cigar)
    # A sequence ``.index`` miss raises ValueError, while an out-of-range
    # lookup raises IndexError: both mean the position cannot be translated.
    try:
        target_start = target_array[ref_array.index(ref_bed.thick_start)]
    except (IndexError, ValueError):
        target_start = target_bed.start
    try:
        target_end = target_array[ref_array.index(ref_bed.thick_end)]
    except (IndexError, ValueError):
        target_end = target_bed.end

    if target_start == target_bed.thick_start and target_end == target_bed.thick_end:
        transcript.attributes["original_cds"] = True
    else:
        transcript.attributes["original_cds"] = False

    if ref_cdna == target_cdna:
        # Identical sequences: the CDS must have been transferred unchanged.
        logger.debug("%s now has phases: %s", transcript.id,
                     transcript.phases)
        if transcript.is_coding is False:
            raise AssertionError("{} not coding".format(transcript.id))
        elif transcript.attributes["original_cds"] is False:
            raise AssertionError("\n".join([
                str(_) for _ in [
                    transcript.id,
                    (target_bed.thick_start, target_start,
                     target_bed.thick_start == target_start),
                    (target_bed.thick_end, target_end,
                     target_bed.thick_end == target_end
                     ), target_bed.thick_start == target_start
                    and target_bed.thick_end == target_end
                ]
            ]))

    return transcript, target_bed, pep_coords