def test_non_coding_negative(self):
    """Render a non-coding, minus-strand, two-exon transcript as GFF and GTF.

    Both renderings are requested with ``with_introns=True`` and compared
    against the full expected text: one transcript row, two exon rows and
    the intron row between them.
    """
    self.maxDiff = None

    def failure_message(*chunks):
        # Shown only when an assertion fails: observed text, then expected.
        return "++++\n\n" + "\n+++\n".join(chunks)

    transcript = Transcript()
    transcript.chrom, transcript.start, transcript.end, transcript.strand = (
        "Chr1", 101, 2000, "-")
    transcript.add_exons([(101, 300), (1701, 2000)])
    transcript.id = "test1"
    transcript.parent = "gene1"
    transcript.finalize()

    observed_gff = transcript.format("gff", with_introns=True)
    expected_gff = "\n".join((
        "Chr1\tMikado\ttranscript\t101\t2000\t.\t-\t.\tID=test1;Parent=gene1",
        "Chr1\tMikado\texon\t101\t300\t.\t-\t.\tID=test1.exon1;Parent=test1",
        "Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tID=test1.intron1;Parent=test1",
        "Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tID=test1.exon2;Parent=test1",
    ))
    self.assertEqual(observed_gff, expected_gff,
                     failure_message(observed_gff, expected_gff))

    observed_gtf = transcript.format("gtf", with_introns=True)
    expected_gtf = "\n".join((
        'Chr1\tMikado\ttranscript\t101\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";',
        'Chr1\tMikado\texon\t101\t300\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";',
        'Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";',
        'Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";',
    ))
    self.assertEqual(observed_gtf, expected_gtf,
                     failure_message(observed_gtf, expected_gtf))
def test_coding_negative(self):
    """Render a coding, minus-strand, two-exon transcript as GFF and GTF.

    Every exon is also registered as CDS. The expected output therefore
    carries an mRNA row (with a Name attribute) followed by interleaved
    CDS / exon rows and the intron row.
    """
    self.maxDiff = None
    separator = "\n+++\n"

    transcript = Transcript()
    transcript.chrom, transcript.start, transcript.end, transcript.strand = (
        "Chr1", 101, 2000, "-")
    exon_coordinates = [(101, 300), (1701, 2000)]
    transcript.add_exons(list(exon_coordinates))
    transcript.add_exons(list(exon_coordinates), features="CDS")
    transcript.id = "test1"
    transcript.parent = "gene1"

    # Phase 0, 0 because the first CDS exon is 300bp
    observed_gff = transcript.format("gff", with_introns=True)
    expected_gff = "\n".join((
        "Chr1\tMikado\tmRNA\t101\t2000\t.\t-\t.\tID=test1;Parent=gene1;Name=test1",
        "Chr1\tMikado\tCDS\t101\t300\t.\t-\t0\tID=test1.CDS1;Parent=test1",
        "Chr1\tMikado\texon\t101\t300\t.\t-\t.\tID=test1.exon1;Parent=test1",
        "Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tID=test1.intron1;Parent=test1",
        "Chr1\tMikado\tCDS\t1701\t2000\t.\t-\t0\tID=test1.CDS2;Parent=test1",
        "Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tID=test1.exon2;Parent=test1",
    ))
    # The failure message additionally lists the transcript's internal ORFs.
    orf_summary = ",\t".join(str(orf) for orf in transcript.internal_orfs)
    self.assertEqual(
        observed_gff,
        expected_gff,
        "++++\n\n" + separator.join([observed_gff, expected_gff, orf_summary]))

    observed_gtf = transcript.format("gtf", with_introns=True)
    expected_gtf = "\n".join((
        'Chr1\tMikado\tmRNA\t101\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1"; Name "test1";',
        'Chr1\tMikado\tCDS\t101\t300\t.\t-\t0\tgene_id "gene1"; transcript_id "test1";',
        'Chr1\tMikado\texon\t101\t300\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";',
        'Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";',
        'Chr1\tMikado\tCDS\t1701\t2000\t.\t-\t0\tgene_id "gene1"; transcript_id "test1";',
        'Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";',
    ))
    self.assertEqual(
        observed_gtf,
        expected_gtf,
        "++++\n\n" + separator.join([observed_gtf, expected_gtf]))
def test_coding_positive(self):
    """Render a coding, plus-strand, two-exon transcript as GFF and GTF.

    Every exon is also registered as CDS. The second CDS row is expected
    to carry ``1`` in the GFF phase column and ``2`` in the same column of
    the GTF output.
    """
    self.maxDiff = None

    transcript = Transcript()
    transcript.chrom = "Chr1"
    transcript.start, transcript.end = 101, 2000
    transcript.strand = "+"
    for extra in ({}, {"features": "CDS"}):
        # First pass registers the exons, second pass the same spans as CDS.
        transcript.add_exons([(101, 300), (1701, 2000)], **extra)
    transcript.id = "test1"
    transcript.parent = "gene1"

    def render_and_compare(fmt, expected_rows):
        # Format the transcript, then compare against the joined rows.
        observed = transcript.format(fmt, with_introns=True)
        expected = "\n".join(expected_rows)
        self.assertEqual(observed, expected,
                         "++++\n\n" + "\n+++\n".join([observed, expected]))

    render_and_compare("gff", [
        "Chr1\tMikado\tmRNA\t101\t2000\t.\t+\t.\tID=test1;Parent=gene1;Name=test1",
        "Chr1\tMikado\tCDS\t101\t300\t.\t+\t0\tID=test1.CDS1;Parent=test1",
        "Chr1\tMikado\texon\t101\t300\t.\t+\t.\tID=test1.exon1;Parent=test1",
        "Chr1\tMikado\tintron\t301\t1700\t.\t+\t.\tID=test1.intron1;Parent=test1",
        "Chr1\tMikado\tCDS\t1701\t2000\t.\t+\t1\tID=test1.CDS2;Parent=test1",
        "Chr1\tMikado\texon\t1701\t2000\t.\t+\t.\tID=test1.exon2;Parent=test1",
    ])
    render_and_compare("gtf", [
        'Chr1\tMikado\tmRNA\t101\t2000\t.\t+\t.\tgene_id "gene1"; transcript_id "test1"; Name "test1";',
        'Chr1\tMikado\tCDS\t101\t300\t.\t+\t0\tgene_id "gene1"; transcript_id "test1";',
        'Chr1\tMikado\texon\t101\t300\t.\t+\t.\tgene_id "gene1"; transcript_id "test1";',
        'Chr1\tMikado\tintron\t301\t1700\t.\t+\t.\tgene_id "gene1"; transcript_id "test1";',
        'Chr1\tMikado\tCDS\t1701\t2000\t.\t+\t2\tgene_id "gene1"; transcript_id "test1";',
        'Chr1\tMikado\texon\t1701\t2000\t.\t+\t.\tgene_id "gene1"; transcript_id "test1";',
    ])
def test_non_coding_negative(self):
    """Check the GFF and GTF text of a non-coding transcript on the minus strand.

    The transcript has two exons (101-300 and 1701-2000); with
    ``with_introns=True`` an intron row (301-1700) is expected between them.
    """
    model = Transcript()
    model.chrom = "Chr1"
    model.start, model.end = 101, 2000
    model.strand = "-"
    model.add_exons([(101, 300), (1701, 2000)])
    model.id, model.parent = "test1", "gene1"
    model.finalize()

    gff_rows = [
        "Chr1\tMikado\ttranscript\t101\t2000\t.\t-\t.\tID=test1;Parent=gene1",
        "Chr1\tMikado\texon\t101\t300\t.\t-\t.\tID=test1.exon1;Parent=test1",
        "Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tID=test1.intron1;Parent=test1",
        "Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tID=test1.exon2;Parent=test1",
    ]
    gtf_rows = [
        'Chr1\tMikado\ttranscript\t101\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";',
        'Chr1\tMikado\texon\t101\t300\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";',
        'Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";',
        'Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";',
    ]

    self.maxDiff = None
    # GFF is rendered and checked first, then GTF.
    for output_format, rows in (("gff", gff_rows), ("gtf", gtf_rows)):
        rendered = model.format(output_format, with_introns=True)
        wanted = "\n".join(rows)
        self.assertEqual(rendered, wanted,
                         "++++\n\n" + "\n+++\n".join([rendered, wanted]))
def test_caseNegative(self):
    """Check the BED12 view of a coding, minus-strand, four-exon transcript.

    After finalisation the transcript must be coding, and its BED12 object
    must agree with the transcript on thick start/end, block sizes, block
    starts and strand; the "bed12" format output must equal ``str()`` of
    that object.
    """
    transcript = Transcript()
    transcript.chrom = "Chr1"
    transcript.start = 101
    transcript.end = 3000
    transcript.strand = "-"
    transcript.id = "test1"

    exon_spans = [(101, 300), (401, 600), (801, 1200), (2501, 3000)]
    # CDS span lengths: 180 + 400 + 200 = 780, and 780 % 3 == 0.
    cds_spans = [(421, 600), (801, 1200), (2501, 2700)]
    transcript.add_exons(exon_spans)
    transcript.add_exons(cds_spans, features="CDS")

    # finalize() is expected to emit at least one record on the "null" logger.
    with self.assertLogs("null", "DEBUG") as _:
        transcript.finalize()

    self.assertTrue(transcript.is_coding)

    bed = transcript.as_bed12()
    expected_sizes = [200, 200, 400, 500]
    expected_starts = [0, 300, 700, 2400]

    self.assertEqual(bed.thick_start, transcript.combined_cds_end)
    self.assertEqual(bed.thick_end, transcript.combined_cds_start)
    self.assertEqual(len(bed.block_sizes), transcript.exon_num)
    self.assertEqual(bed.block_sizes, expected_sizes, bed.block_sizes)
    self.assertEqual(bed.strand, "-")
    self.assertEqual(bed.block_starts, expected_starts, bed.block_starts)
    self.assertEqual(transcript.format("bed12"), str(bed))

    columns = [
        "Chr1",
        100,
        3000,
        transcript.id,
        0,
        transcript.strand,
        bed.thick_start - 1,
        bed.thick_end,
        0,
        4,
        ",".join(map(str, expected_sizes)),
        ",".join(map(str, expected_starts)),
    ]
    self.assertEqual(str(bed), "\t".join(str(column) for column in columns))
def main():
    """Command-line entry point.

    Streams the records of an annotation file, groups them into
    ``Transcript`` objects (and, for GFF3 input, ``Gene`` objects), passes
    some of them through ``remove_introns`` / ``remove_introns_from_transcr``
    and prints the result.

    Command-line arguments (as declared below):
        -mi / --max-intron: integer, default 10000; negative values are
            rejected.
        gff: input annotation; the raw argument is converted by
            ``parser_factory``.
        out: optional output file opened in text-write mode; defaults to
            ``sys.stdout``.

    Raises:
        ValueError: if the requested maximum intron length is negative.
    """
    # NOTE(review): the nesting of the loop body below was inferred from the
    # statement order; confirm it against the intended layout.
    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("-mi", "--max-intron", default=10000,
                        dest="max_intron", type=int,
                        help="Maximum intron length for UTR introns.")
    parser.add_argument("gff", type=parser_factory)
    parser.add_argument("out", default=sys.stdout,
                        type=argparse.FileType("wt"), nargs="?")
    args = parser.parse_args()

    if args.max_intron < 0:
        raise ValueError("Max intron length <0 specified! {0}".format(
            args.max_intron))

    # The output format mirrors the input: GFF3 in, "gff3" out; otherwise "gtf".
    ref_gff = isinstance(args.gff, GFF3)

    if ref_gff:
        form = "gff3"
    else:
        form = "gtf"

    current = None             # gene being assembled (GFF3 input only)
    current_transcript = None  # transcript currently receiving exon/CDS records
    last_header = []           # most recent header record(s) not yet written out

    for record in args.gff:
        if record.header is True:
            # print(record, file=sys.stderr)
            # A header record flushes the gene assembled so far.
            if current is not None:
                current = remove_introns(current, args)
                print(current.format(form), file=args.out)
                print(*last_header, sep="\n", file=args.out, end='')
                current = None
                current_transcript = None
            # NOTE(review): this print has no file= argument, so it goes to
            # standard output even when "out" is a file - confirm intent.
            print(*last_header, sep="\n", end="")
            last_header = [record]
            continue
        # Only these four feature types are processed; everything else is skipped.
        if record.feature not in ("gene", "mRNA", "CDS", "exon"):
            continue

        if record.is_gene is True and ref_gff:
            print(record, file=sys.stderr)
            # NOTE(review): last_header is emptied before it is printed just
            # below, so that print cannot emit any header - confirm intent.
            last_header = []
            if current is not None:
                # current = remove_introns(current, args)
                # (intron removal is disabled here, so the gene is printed unmodified)
                print(current.format(form), file=args.out)
                print(*last_header, sep="\n", file=args.out, end='')
                # NOTE(review): the pending current_transcript is dropped
                # here without being added to `current` - confirm.
                current = None
                current_transcript = None

        if record.is_transcript:
            if ref_gff is False:
                if current_transcript is not None:
                    # current_transcript = remove_introns_from_transcr(current_transcript,
                    #                                                  args)
                    # NOTE(review): assert statements are stripped under
                    # "python -O"; a non-coding transcript would then pass.
                    assert current_transcript.combined_cds_length > 0
                    print(current_transcript, file=args.out)
                    print(*last_header, sep="\n", file=args.out)
                    last_header = []
            elif ref_gff is True:
                if current_transcript is not None:
                    if current is None:
                        current = Gene(current_transcript)
                        current.add(current_transcript)
                    else:
                        # NOTE(review): this asserts that the transcript's
                        # parent differs from the gene it is then added to;
                        # the comparison may be inverted - confirm.
                        assert current_transcript.parent[0] != current.id
                        current.add(current_transcript)
                    # Disabled earlier logic, kept for reference:
                    # if current.id == current_transcript.parent[0]:
                    # else:
                    #     current = remove_introns(current, args)
                    #     print(current.format(form), file=args.out)
                    #     print("###", file=args.out)
                    #     current = None
                    # elif current_transcript is not None:
                    #     current = Gene(current_transcript)
            # Start collecting the transcript described by this record.
            current_transcript = Transcript(record)
        elif record.is_exon:
            if record.feature not in ("CDS", "exon"):
                continue
            # NOTE(review): fails with AttributeError if an exon/CDS record
            # arrives while current_transcript is None.
            current_transcript.add_exon(record)
        else:
            continue
        continue

    # Flush whatever is still pending once the input is exhausted.
    if ref_gff and current is not None:
        print(*last_header, sep="\n", file=args.out)
        last_header = []
        # NOTE(review): the final current_transcript is not added to `current`
        # before the gene is printed - confirm.
        current = remove_introns(current, args)
        print(current.format(form), file=args.out)
    elif not ref_gff and current_transcript is not None:
        current_transcript = remove_introns_from_transcr(
            current_transcript, args)
        print(current_transcript.format(form), file=args.out)
        print(*last_header, sep="\n", file=args.out, end='')
def main():
    """Command-line entry point.

    Streams the records of an annotation file, groups them into
    ``Transcript`` objects (and, for GFF3 input, ``Gene`` objects), passes
    some of them through ``remove_introns`` / ``remove_introns_from_transcr``
    and prints the result.

    Command-line arguments (as declared below):
        -mi / --max-intron: integer, default 10000; negative values are
            rejected.
        gff: input annotation; the raw argument is converted by ``to_gff``.
        out: optional output file opened in text-write mode; defaults to
            ``sys.stdout``.

    Raises:
        ValueError: if the requested maximum intron length is negative.
    """
    # NOTE(review): the nesting of the loop body below was inferred from the
    # statement order; confirm it against the intended layout.
    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("-mi", "--max-intron", default=10000,
                        dest="max_intron", type=int,
                        help="Maximum intron length for UTR introns.")
    parser.add_argument("gff", type=to_gff)
    parser.add_argument("out", default=sys.stdout,
                        type=argparse.FileType("wt"), nargs="?")
    args = parser.parse_args()

    if args.max_intron < 0:
        raise ValueError("Max intron length <0 specified! {0}".format(args.max_intron))

    # The output format mirrors the input: GFF3 in, "gff3" out; otherwise "gtf".
    ref_gff = isinstance(args.gff, GFF3)

    if ref_gff:
        form = "gff3"
    else:
        form = "gtf"

    current = None             # gene being assembled (GFF3 input only)
    current_transcript = None  # transcript currently receiving exon/CDS records
    last_header = []           # most recent header record(s) not yet written out

    for record in args.gff:
        if record.header is True:
            # print(record, file=sys.stderr)
            # A header record flushes the gene assembled so far.
            if current is not None:
                current = remove_introns(current, args)
                print(current.format(form), file=args.out)
                print(*last_header, sep="\n", file=args.out, end='')
                current = None
                current_transcript = None
            # NOTE(review): this print has no file= argument, so it goes to
            # standard output even when "out" is a file - confirm intent.
            print(*last_header, sep="\n", end="")
            last_header = [record]
            continue
        # Only these four feature types are processed; everything else is skipped.
        if record.feature not in ("gene", "mRNA", "CDS", "exon"):
            continue

        if record.is_gene is True and ref_gff:
            print(record, file=sys.stderr)
            # NOTE(review): last_header is emptied before it is printed just
            # below, so that print cannot emit any header - confirm intent.
            last_header = []
            if current is not None:
                # current = remove_introns(current, args)
                # (intron removal is disabled here, so the gene is printed unmodified)
                print(current.format(form), file=args.out)
                print(*last_header, sep="\n", file=args.out, end='')
                # NOTE(review): the pending current_transcript is dropped
                # here without being added to `current` - confirm.
                current = None
                current_transcript = None

        if record.is_transcript:
            if ref_gff is False:
                if current_transcript is not None:
                    # current_transcript = remove_introns_from_transcr(current_transcript,
                    #                                                  args)
                    # NOTE(review): assert statements are stripped under
                    # "python -O"; a non-coding transcript would then pass.
                    assert current_transcript.combined_cds_length > 0
                    print(current_transcript, file=args.out)
                    print(*last_header, sep="\n", file=args.out)
                    last_header = []
            elif ref_gff is True:
                if current_transcript is not None:
                    if current is None:
                        current = Gene(current_transcript)
                        current.add(current_transcript)
                    else:
                        # NOTE(review): this asserts that the transcript's
                        # parent differs from the gene it is then added to;
                        # the comparison may be inverted - confirm.
                        assert current_transcript.parent[0] != current.id
                        current.add(current_transcript)
                    # Disabled earlier logic, kept for reference:
                    # if current.id == current_transcript.parent[0]:
                    # else:
                    #     current = remove_introns(current, args)
                    #     print(current.format(form), file=args.out)
                    #     print("###", file=args.out)
                    #     current = None
                    # elif current_transcript is not None:
                    #     current = Gene(current_transcript)
            # Start collecting the transcript described by this record.
            current_transcript = Transcript(record)
        elif record.is_exon:
            if record.feature not in ("CDS", "exon"):
                continue
            # NOTE(review): fails with AttributeError if an exon/CDS record
            # arrives while current_transcript is None.
            current_transcript.add_exon(record)
        else:
            continue
        continue

    # Flush whatever is still pending once the input is exhausted.
    if ref_gff and current is not None:
        print(*last_header, sep="\n", file=args.out)
        last_header = []
        # NOTE(review): the final current_transcript is not added to `current`
        # before the gene is printed - confirm.
        current = remove_introns(current, args)
        print(current.format(form), file=args.out)
    elif not ref_gff and current_transcript is not None:
        current_transcript = remove_introns_from_transcr(current_transcript, args)
        print(current_transcript.format(form), file=args.out)
        print(*last_header, sep="\n", file=args.out, end='')