Пример #1
0
    def test_non_coding_negative(self):
        """Render a two-exon, minus-strand transcript without CDS as GFF and GTF.

        Both outputs are requested with introns and compared against the
        expected text, which is assembled row by row from its columns.
        """
        self.maxDiff = None

        transcript = Transcript()
        transcript.chrom = "Chr1"
        transcript.start, transcript.end = 101, 2000
        transcript.strand = "-"
        transcript.add_exons([(101, 300), (1701, 2000)])
        transcript.id = "test1"
        transcript.parent = "gene1"
        transcript.finalize()

        def row(feature, start, end, attributes):
            # Every expected line shares seqid, source, score, strand and phase.
            columns = ["Chr1", "Mikado", feature, str(start), str(end),
                       ".", "-", ".", attributes]
            return "\t".join(columns)

        features = [("transcript", 101, 2000),
                    ("exon", 101, 300),
                    ("intron", 301, 1700),
                    ("exon", 1701, 2000)]

        # --- GFF ---
        gff_attributes = ["ID=test1;Parent=gene1",
                          "ID=test1.exon1;Parent=test1",
                          "ID=test1.intron1;Parent=test1",
                          "ID=test1.exon2;Parent=test1"]
        expected_gff = "\n".join(
            row(name, start, end, attributes)
            for (name, start, end), attributes in zip(features, gff_attributes))
        observed_gff = transcript.format("gff", with_introns=True)
        self.assertEqual(observed_gff, expected_gff,
                         "++++\n\n" + "\n+++\n".join([observed_gff, expected_gff]))

        # --- GTF ---
        # Every GTF line carries the same attribute column.
        gtf_attributes = 'gene_id "gene1"; transcript_id "test1";'
        expected_gtf = "\n".join(
            row(name, start, end, gtf_attributes)
            for name, start, end in features)
        observed_gtf = transcript.format("gtf", with_introns=True)
        self.assertEqual(observed_gtf, expected_gtf,
                         "++++\n\n" + "\n+++\n".join([observed_gtf, expected_gtf]))
Пример #2
0
    def test_coding_negative(self):
        """Render a fully coding, two-exon, minus-strand transcript as GFF and GTF.

        The transcript is not finalized explicitly before formatting. Both
        outputs are requested with introns and compared against expected text
        assembled row by row.
        """
        self.maxDiff = None

        transcript = Transcript()
        transcript.chrom = "Chr1"
        transcript.start, transcript.end = 101, 2000
        transcript.strand = "-"
        exon_coords = [(101, 300), (1701, 2000)]
        transcript.add_exons(list(exon_coords))
        transcript.add_exons(list(exon_coords), features="CDS")
        transcript.id = "test1"
        transcript.parent = "gene1"

        def row(feature, start, end, phase, attributes):
            # seqid, source, score and strand are the same on every line.
            columns = ["Chr1", "Mikado", feature, str(start), str(end),
                       ".", "-", phase, attributes]
            return "\t".join(columns)

        # --- GFF ---
        # Both CDS segments are expected in phase 0: the first CDS exon is 300bp.
        observed_gff = transcript.format("gff", with_introns=True)
        expected_gff = "\n".join([
            row("mRNA", 101, 2000, ".", "ID=test1;Parent=gene1;Name=test1"),
            row("CDS", 101, 300, "0", "ID=test1.CDS1;Parent=test1"),
            row("exon", 101, 300, ".", "ID=test1.exon1;Parent=test1"),
            row("intron", 301, 1700, ".", "ID=test1.intron1;Parent=test1"),
            row("CDS", 1701, 2000, "0", "ID=test1.CDS2;Parent=test1"),
            row("exon", 1701, 2000, ".", "ID=test1.exon2;Parent=test1"),
        ])
        # The failure message also lists the internal ORFs of the transcript.
        orf_dump = ",\t".join(str(orf) for orf in transcript.internal_orfs)
        failure_text = "++++\n\n" + "\n+++\n".join(
            [observed_gff, expected_gff, orf_dump])
        self.assertEqual(observed_gff, expected_gff, failure_text)

        # --- GTF ---
        base_attributes = 'gene_id "gene1"; transcript_id "test1";'
        observed_gtf = transcript.format("gtf", with_introns=True)
        expected_gtf = "\n".join([
            row("mRNA", 101, 2000, ".", base_attributes + ' Name "test1";'),
            row("CDS", 101, 300, "0", base_attributes),
            row("exon", 101, 300, ".", base_attributes),
            row("intron", 301, 1700, ".", base_attributes),
            row("CDS", 1701, 2000, "0", base_attributes),
            row("exon", 1701, 2000, ".", base_attributes),
        ])
        self.assertEqual(observed_gtf, expected_gtf,
                         "++++\n\n" + "\n+++\n".join([observed_gtf, expected_gtf]))
Пример #3
0
    def test_coding_positive(self):
        """Render a fully coding, two-exon, plus-strand transcript as GFF and GTF.

        Both outputs are requested with introns. Note that the expected eighth
        column of the second CDS differs between the two formats (1 in the GFF
        text, 2 in the GTF text).
        """
        self.maxDiff = None

        transcript = Transcript()
        transcript.chrom = "Chr1"
        transcript.start, transcript.end = 101, 2000
        transcript.strand = "+"
        transcript.add_exons([(101, 300), (1701, 2000)])
        transcript.add_exons([(101, 300), (1701, 2000)], features="CDS")
        transcript.id = "test1"
        transcript.parent = "gene1"

        def row(feature, start, end, phase, attributes):
            # seqid, source, score and strand are the same on every line.
            columns = ["Chr1", "Mikado", feature, str(start), str(end),
                       ".", "+", phase, attributes]
            return "\t".join(columns)

        # --- GFF ---
        observed_gff = transcript.format("gff", with_introns=True)
        expected_gff = "\n".join([
            row("mRNA", 101, 2000, ".", "ID=test1;Parent=gene1;Name=test1"),
            row("CDS", 101, 300, "0", "ID=test1.CDS1;Parent=test1"),
            row("exon", 101, 300, ".", "ID=test1.exon1;Parent=test1"),
            row("intron", 301, 1700, ".", "ID=test1.intron1;Parent=test1"),
            row("CDS", 1701, 2000, "1", "ID=test1.CDS2;Parent=test1"),
            row("exon", 1701, 2000, ".", "ID=test1.exon2;Parent=test1"),
        ])
        self.assertEqual(observed_gff, expected_gff,
                         "++++\n\n" + "\n+++\n".join([observed_gff, expected_gff]))

        # --- GTF ---
        base_attributes = 'gene_id "gene1"; transcript_id "test1";'
        observed_gtf = transcript.format("gtf", with_introns=True)
        expected_gtf = "\n".join([
            row("mRNA", 101, 2000, ".", base_attributes + ' Name "test1";'),
            row("CDS", 101, 300, "0", base_attributes),
            row("exon", 101, 300, ".", base_attributes),
            row("intron", 301, 1700, ".", base_attributes),
            row("CDS", 1701, 2000, "2", base_attributes),
            row("exon", 1701, 2000, ".", base_attributes),
        ])
        self.assertEqual(observed_gtf, expected_gtf,
                         "++++\n\n" + "\n+++\n".join([observed_gtf, expected_gtf]))
Пример #4
0
    def test_non_coding_negative(self):
        """Check the GFF and GTF text of a minus-strand transcript with no CDS.

        The transcript has two exons and is finalized before being formatted;
        both formats are requested with introns included.
        """
        self.maxDiff = None

        model = Transcript()
        model.chrom = "Chr1"
        model.start = 101
        model.end = 2000
        model.strand = "-"
        exons = [(101, 300), (1701, 2000)]
        model.add_exons(exons)
        model.id = "test1"
        model.parent = "gene1"
        model.finalize()

        # Leading columns shared by all lines, and the score/strand/phase triplet.
        prefix = "Chr1\tMikado\t"
        middle = "\t.\t-\t.\t"

        # --- GFF ---
        gff_lines = [
            prefix + "transcript\t101\t2000" + middle + "ID=test1;Parent=gene1",
            prefix + "exon\t101\t300" + middle + "ID=test1.exon1;Parent=test1",
            prefix + "intron\t301\t1700" + middle + "ID=test1.intron1;Parent=test1",
            prefix + "exon\t1701\t2000" + middle + "ID=test1.exon2;Parent=test1",
        ]
        wanted_gff = "\n".join(gff_lines)
        got_gff = model.format("gff", with_introns=True)
        self.assertEqual(got_gff,
                         wanted_gff,
                         "++++\n\n"+"\n+++\n".join([got_gff, wanted_gff]))

        # --- GTF ---
        gtf_tail = middle + 'gene_id "gene1"; transcript_id "test1";'
        gtf_lines = [
            prefix + "transcript\t101\t2000" + gtf_tail,
            prefix + "exon\t101\t300" + gtf_tail,
            prefix + "intron\t301\t1700" + gtf_tail,
            prefix + "exon\t1701\t2000" + gtf_tail,
        ]
        wanted_gtf = "\n".join(gtf_lines)
        got_gtf = model.format("gtf", with_introns=True)
        self.assertEqual(got_gtf, wanted_gtf,
                         "++++\n\n" + "\n+++\n".join([got_gtf, wanted_gtf]))
Пример #5
0
    def test_caseNegative(self):
        """Check the BED12 view of a coding, four-exon, minus-strand transcript.

        The test verifies the thick start/end against the combined CDS
        boundaries, the block sizes and starts, and the full BED12 line
        produced both by ``as_bed12`` and by ``format("bed12")``.
        """
        transcript = Transcript()
        transcript.chrom = "Chr1"
        transcript.start = 101
        transcript.end = 3000
        transcript.strand = "-"
        transcript.id = "test1"

        exon_coords = [(101, 300), (401, 600), (801, 1200), (2501, 3000)]
        transcript.add_exons(exon_coords)

        # CDS segments: 180 + 400 + 200 = 780 bases, a multiple of three.
        cds_coords = [(421, 600), (801, 1200), (2501, 2700)]
        transcript.add_exons(cds_coords, features="CDS")

        # Finalization is expected to log on the "null" logger at DEBUG or above.
        with self.assertLogs("null", "DEBUG") as _:
            transcript.finalize()
        self.assertTrue(transcript.is_coding)

        bed = transcript.as_bed12()
        expected_sizes = [200, 200, 400, 500]
        expected_starts = [0, 300, 700, 2400]

        # The thick interval is compared against the CDS end/start in that order.
        self.assertEqual(bed.thick_start, transcript.combined_cds_end)
        self.assertEqual(bed.thick_end, transcript.combined_cds_start)
        self.assertEqual(len(bed.block_sizes), transcript.exon_num)
        self.assertEqual(bed.block_sizes, expected_sizes, bed.block_sizes)
        self.assertEqual(bed.strand, "-")
        self.assertEqual(bed.block_starts, expected_starts, bed.block_starts)

        self.assertEqual(transcript.format("bed12"), str(bed))

        # Expected BED12 line, field by field.
        fields = [
            "Chr1", 100, 3000, transcript.id, 0, transcript.strand,
            bed.thick_start - 1, bed.thick_end,
            0, 4,
            ",".join(map(str, expected_sizes)),
            ",".join(map(str, expected_starts)),
        ]
        self.assertEqual(str(bed), "\t".join(map(str, fields)))
Пример #6
0
def main():
    """Command-line entry point: read an annotation file and print it back out.

    Arguments are parsed with argparse: ``-mi/--max-intron`` (int, default
    10000), a positional ``gff`` converted by ``parser_factory``, and an
    optional positional ``out`` (text file opened for writing, defaulting to
    ``sys.stdout``).

    The records yielded by ``args.gff`` are grouped into ``Transcript``
    objects and, when the parser is a ``GFF3`` instance, into ``Gene``
    objects, which are then printed to ``args.out`` in "gff3" or "gtf" form.

    Raises:
        ValueError: if ``--max-intron`` is negative.
        AssertionError: from the sanity checks inside the record loop.
    """

    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("-mi",
                        "--max-intron",
                        default=10000,
                        dest="max_intron",
                        type=int,
                        help="Maximum intron length for UTR introns.")
    parser.add_argument("gff", type=parser_factory)
    parser.add_argument("out",
                        default=sys.stdout,
                        type=argparse.FileType("wt"),
                        nargs="?")
    args = parser.parse_args()

    if args.max_intron < 0:
        raise ValueError("Max intron length <0 specified! {0}".format(
            args.max_intron))

    # The output format mirrors the kind of parser built for the input.
    ref_gff = isinstance(args.gff, GFF3)
    if ref_gff:
        form = "gff3"
    else:
        form = "gtf"

    # `current` is the Gene being assembled (only ever created in the GFF3
    # branch below); `current_transcript` is the Transcript that is currently
    # collecting exon/CDS records.
    current = None
    current_transcript = None

    # Most recent header record, replayed when pending features are flushed.
    last_header = []
    for record in args.gff:
        if record.header is True:
            # Header record: flush the pending gene (after intron removal),
            # then remember this header.
            # print(record, file=sys.stderr)
            if current is not None:
                current = remove_introns(current, args)
                print(current.format(form), file=args.out)
                print(*last_header, sep="\n", file=args.out, end='')
                current = None
                current_transcript = None
            # NOTE(review): unlike the other header prints, this call has no
            # `file=` argument, so it writes to stdout rather than args.out --
            # confirm this is intended.
            print(*last_header, sep="\n", end="")
            last_header = [record]
            continue
        # Only gene, mRNA, CDS and exon features are processed.
        if record.feature not in ("gene", "mRNA", "CDS", "exon"):
            continue

        if record.is_gene is True and ref_gff:
            # New gene in GFF3 input: echo it to stderr and flush the previous
            # gene. Intron removal is commented out on this path, so the gene
            # is printed as assembled.
            print(record, file=sys.stderr)
            last_header = []
            if current is not None:
                # current = remove_introns(current, args)
                print(current.format(form), file=args.out)
                # NOTE(review): last_header was emptied a few lines above, so
                # this print has nothing to emit.
                print(*last_header, sep="\n", file=args.out, end='')
                current = None
                current_transcript = None
        if record.is_transcript:
            if ref_gff is False:
                # Non-GFF3 input: print the previous transcript before
                # starting a new one (intron removal is commented out here).
                if current_transcript is not None:
                    # current_transcript = remove_introns_from_transcr(current_transcript,
                    #                                                 args)
                    # NOTE(review): assert statements are skipped under
                    # `python -O`; this check would silently disappear.
                    assert current_transcript.combined_cds_length > 0
                    print(current_transcript, file=args.out)
                    print(*last_header, sep="\n", file=args.out)
                    last_header = []
            elif ref_gff is True:
                # GFF3 input: attach the previous transcript to the gene that
                # is being built, creating the gene from it if needed.
                if current_transcript is not None:
                    if current is None:
                        current = Gene(current_transcript)
                        current.add(current_transcript)
                    else:
                        # NOTE(review): this asserts that the transcript's
                        # first parent DIFFERS from the gene id before adding
                        # it to that gene, which looks inverted -- confirm.
                        assert current_transcript.parent[0] != current.id
                        current.add(current_transcript)
                    # if current.id == current_transcript.parent[0]:

                    # else:
                    #     current = remove_introns(current, args)
                    #     print(current.format(form), file=args.out)
                    #     print("###", file=args.out)
                    #     current = None

                # elif current_transcript is not None:
                #     current = Gene(current_transcript)

            # Start collecting features for the transcript in this record.
            current_transcript = Transcript(record)
        elif record.is_exon:
            if record.feature not in ("CDS", "exon"):
                continue
            # NOTE(review): current_transcript is still None if an exon/CDS
            # record arrives before any transcript record; add_exon would then
            # fail with AttributeError.
            current_transcript.add_exon(record)
        else:
            continue
        continue

    # End of input: flush whatever is still pending.
    # NOTE(review): in the GFF3 path nothing adds the final
    # `current_transcript` to `current` after the loop -- verify that the last
    # transcript is not lost.
    if ref_gff and current is not None:
        print(*last_header, sep="\n", file=args.out)
        last_header = []
        current = remove_introns(current, args)
        print(current.format(form), file=args.out)
    elif not ref_gff and current_transcript is not None:
        current_transcript = remove_introns_from_transcr(
            current_transcript, args)
        print(current_transcript.format(form), file=args.out)
        print(*last_header, sep="\n", file=args.out, end='')
Пример #7
0
def main():
    """Command-line entry point: read an annotation file and print it back out.

    Arguments are parsed with argparse: ``-mi/--max-intron`` (int, default
    10000), a positional ``gff`` converted by ``to_gff``, and an optional
    positional ``out`` (text file opened for writing, defaulting to
    ``sys.stdout``).

    The records yielded by ``args.gff`` are grouped into ``Transcript``
    objects and, when the parser is a ``GFF3`` instance, into ``Gene``
    objects, which are then printed to ``args.out`` in "gff3" or "gtf" form.

    Raises:
        ValueError: if ``--max-intron`` is negative.
        AssertionError: from the sanity checks inside the record loop.
    """

    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("-mi", "--max-intron", default=10000, dest="max_intron",
                        type=int, help="Maximum intron length for UTR introns.")
    parser.add_argument("gff", type=to_gff)
    parser.add_argument("out", default=sys.stdout, type=argparse.FileType("wt"), nargs="?")
    args = parser.parse_args()

    if args.max_intron < 0:
        raise ValueError("Max intron length <0 specified! {0}".format(args.max_intron))

    # The output format mirrors the kind of parser built for the input.
    ref_gff = isinstance(args.gff, GFF3)
    if ref_gff:
        form = "gff3"
    else:
        form = "gtf"

    # `current` is the Gene being assembled (only ever created in the GFF3
    # branch below); `current_transcript` is the Transcript that is currently
    # collecting exon/CDS records.
    current = None
    current_transcript = None

    # Most recent header record, replayed when pending features are flushed.
    last_header = []
    for record in args.gff:
        if record.header is True:
            # Header record: flush the pending gene (after intron removal),
            # then remember this header.
            # print(record, file=sys.stderr)
            if current is not None:
                current = remove_introns(current, args)
                print(current.format(form), file=args.out)
                print(*last_header, sep="\n", file=args.out, end='')
                current = None
                current_transcript = None
            # NOTE(review): unlike the other header prints, this call has no
            # `file=` argument, so it writes to stdout rather than args.out --
            # confirm this is intended.
            print(*last_header, sep="\n", end="")
            last_header = [record]
            continue
        # Only gene, mRNA, CDS and exon features are processed.
        if record.feature not in ("gene", "mRNA", "CDS", "exon"):
            continue

        if record.is_gene is True and ref_gff:
            # New gene in GFF3 input: echo it to stderr and flush the previous
            # gene. Intron removal is commented out on this path, so the gene
            # is printed as assembled.
            print(record, file=sys.stderr)
            last_header = []
            if current is not None:
                # current = remove_introns(current, args)
                print(current.format(form), file=args.out)
                # NOTE(review): last_header was emptied a few lines above, so
                # this print has nothing to emit.
                print(*last_header, sep="\n", file=args.out, end='')
                current = None
                current_transcript = None
        if record.is_transcript:
            if ref_gff is False:
                # Non-GFF3 input: print the previous transcript before
                # starting a new one (intron removal is commented out here).
                if current_transcript is not None:
                    # current_transcript = remove_introns_from_transcr(current_transcript,
                    #                                                 args)
                    # NOTE(review): assert statements are skipped under
                    # `python -O`; this check would silently disappear.
                    assert current_transcript.combined_cds_length > 0
                    print(current_transcript, file=args.out)
                    print(*last_header, sep="\n", file=args.out)
                    last_header = []
            elif ref_gff is True:
                # GFF3 input: attach the previous transcript to the gene that
                # is being built, creating the gene from it if needed.
                if current_transcript is not None:
                    if current is None:
                        current = Gene(current_transcript)
                        current.add(current_transcript)
                    else:
                        # NOTE(review): this asserts that the transcript's
                        # first parent DIFFERS from the gene id before adding
                        # it to that gene, which looks inverted -- confirm.
                        assert current_transcript.parent[0] != current.id
                        current.add(current_transcript)
                    # if current.id == current_transcript.parent[0]:

                    # else:
                    #     current = remove_introns(current, args)
                    #     print(current.format(form), file=args.out)
                    #     print("###", file=args.out)
                    #     current = None

                # elif current_transcript is not None:
                #     current = Gene(current_transcript)

            # Start collecting features for the transcript in this record.
            current_transcript = Transcript(record)
        elif record.is_exon:
            if record.feature not in ("CDS", "exon"):
                continue
            # NOTE(review): current_transcript is still None if an exon/CDS
            # record arrives before any transcript record; add_exon would then
            # fail with AttributeError.
            current_transcript.add_exon(record)
        else:
            continue
        continue

    # End of input: flush whatever is still pending.
    # NOTE(review): in the GFF3 path nothing adds the final
    # `current_transcript` to `current` after the loop -- verify that the last
    # transcript is not lost.
    if ref_gff and current is not None:
        print(*last_header, sep="\n", file=args.out)
        last_header = []
        current = remove_introns(current, args)
        print(current.format(form), file=args.out)
    elif not ref_gff and current_transcript is not None:
        current_transcript = remove_introns_from_transcr(current_transcript,
                                                         args)
        print(current_transcript.format(form), file=args.out)
        print(*last_header, sep="\n", file=args.out, end='')