def test_merger(self):
    """Check that ``utilities.merge_partial`` concatenates the partial files in
    the order given by their numeric ``N/`` prefixes, strips the prefixes,
    deletes the input files, and leaves the merged output in place."""
    with tempfile.NamedTemporaryFile(suffix=".tmp", mode="wt", delete=False) as first, \
            tempfile.NamedTemporaryFile(suffix=".tmp", mode="wt", delete=False) as second, \
            tempfile.NamedTemporaryFile(suffix=".tmp", mode="wt", delete=False) as third, \
            tempfile.NamedTemporaryFile(suffix=".out", mode="wt", delete=False) as out:
        # Lines are deliberately scattered out of order across the three files;
        # the merger must reorder them by the numeric prefix.
        print("1/first case", file=first)
        print("3/third case", file=first)
        print("4/fourth case", file=first)
        print("2/second case", file=second)
        print("5/fifth case", file=third)
        print("6/sixth case", file=second)
        first.flush()
        second.flush()
        third.flush()
        logger = create_default_logger("test_merger", level="DEBUG")
        utilities.merge_partial([first.name, second.name, third.name], out, logger=logger)
    with open(out.name) as _:
        lines = [l for l in _]
    self.assertEqual(len(lines), 6, (out.name, lines))
    self.assertEqual(lines[0], "first case\n", lines)
    self.assertEqual(lines[1], "second case\n")
    self.assertEqual(lines[2], "third case\n")
    self.assertEqual(lines[3], "fourth case\n")
    self.assertEqual(lines[4], "fifth case\n")
    self.assertEqual(lines[5], "sixth case\n")
    # Verify the temporary input files have been deleted by the merger.
    existing = [handle.name for handle in (first, second, third)
                if os.path.exists(handle.name)]
    # BUG FIX: the cleanup used to live inside the assertEqual *message*
    # argument as a side-effectful list comprehension, so on failure the
    # report was a useless list of None values.  Clean up first, then report
    # the actual leftover file names.
    for name in existing:
        os.remove(name)
    self.assertEqual(len(existing), 0, existing)
    self.assertTrue(os.path.exists(out.name))
    os.remove(out.name)
def main():
    """Sanitise one or more FASTA files: give every record a per-file prefix
    (when more than one input is given), strip descriptions, and warn about
    IDs that recur across the inputs.  Writes FASTA to ``--out``."""
    logger = create_default_logger("sanitizer")
    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("-o", "--out", default=sys.stdout, type=argparse.FileType("wt"))
    parser.add_argument("fasta", nargs="+", type=argparse.FileType("rt"))
    args = parser.parse_args()

    found_ids = Counter()
    starter = 96  # chr(97) == "a"; incremented before use, so the first prefix is "a_"
    for fasta in args.fasta:
        if len(args.fasta) > 1:
            starter += 1
            prefix = "{}_".format(chr(starter))
        else:
            prefix = ""
        for record in Bio.SeqIO.parse(fasta, "fasta"):
            if record.id in found_ids:
                logger.warning(
                    "ID found other {} time{} in the input files!".format(
                        found_ids[record.id],
                        "s" if found_ids[record.id] > 1 else ""))
            # BUG FIX: the counter was never updated, so the membership test
            # above was always False and the duplicate warning could not fire.
            # Count the *original* ID, before the prefix is applied.
            found_ids.update([record.id])
            record.id = "{}{}".format(prefix, record.id)
            record.description = ""
            Bio.SeqIO.write(record, args.out, "fasta")
    args.out.close()
def main():
    """Sanitise FASTA files (gzip-aware variant): prefix IDs per input file,
    replace ``|`` in IDs with ``_``, drop records shorter than ``--min-length``,
    and warn about IDs recurring across inputs.  Writes FASTA to ``--out``."""
    logger = create_default_logger("sanitizer")
    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("-ml", "--min-length", default=0, type=pos)
    parser.add_argument("-o", "--out", default=sys.stdout, type=argparse.FileType("wt"))
    parser.add_argument("fasta", nargs="+", type=argparse.FileType("rt"))
    args = parser.parse_args()

    found_ids = Counter()
    starter = 96  # chr(97) == "a"; incremented before use, so the first prefix is "a_"
    for fasta in args.fasta:
        if fasta.name.endswith(".gz"):
            # argparse opened the file in text mode; reopen through gzip instead.
            fasta.close()
            fasta = gzip.open(fasta.name, "rt")
        if len(args.fasta) > 1:
            starter += 1
            prefix = "{}_".format(chr(starter))
        else:
            prefix = ""
        for record in Bio.SeqIO.parse(fasta, "fasta"):
            if record.id in found_ids:
                logger.warning(
                    "ID found other {} time{} in the input files!".format(
                        found_ids[record.id],
                        "s" if found_ids[record.id] > 1 else ""))
            # BUG FIX: the counter was never updated, so the membership test
            # above was always False and the duplicate warning could not fire.
            found_ids.update([record.id])
            record.id = "{}{}".format(prefix, re.sub(r"\|", "_", record.id))
            record.description = ""
            if len(record) < args.min_length:
                continue
            Bio.SeqIO.write(record, args.out, "fasta")
    args.out.close()
def main():
    """Sanitise FASTA files: give every record a per-file prefix (when more
    than one input is given), strip descriptions, and warn about IDs that
    recur across the inputs.  Writes FASTA to ``--out``."""
    logger = create_default_logger("sanitizer")
    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("-o", "--out", default=sys.stdout, type=argparse.FileType("wt"))
    parser.add_argument("fasta", nargs="+", type=argparse.FileType("rt"))
    args = parser.parse_args()

    found_ids = Counter()
    starter = 96  # chr(97) == "a"; incremented before use, so the first prefix is "a_"
    for fasta in args.fasta:
        if len(args.fasta) > 1:
            starter += 1
            prefix = "{}_".format(chr(starter))
        else:
            prefix = ""
        for record in Bio.SeqIO.parse(fasta, "fasta"):
            if record.id in found_ids:
                logger.warning(
                    "ID found other {} time{} in the input files!".format(
                        found_ids[record.id],
                        "s" if found_ids[record.id] > 1 else ""))
            # BUG FIX: the counter was never updated, so the membership test
            # above was always False and the duplicate warning could not fire.
            found_ids.update([record.id])
            record.id = "{}{}".format(prefix, record.id)
            record.description = ""
            Bio.SeqIO.write(record, args.out, "fasta")
    args.out.close()
def __init__(self, out_sq, queue, verbosity="INFO"):
    """Initialise the worker process.

    :param out_sq: path of the per-worker SQLite database this process will
        write its results into.
    :param queue: multiprocessing queue from which work items are consumed.
    :param verbosity: logging level name used for the queue-backed logger.
    """
    super().__init__()
    self.out_sq = out_sq
    # NOTE(review): `logging_queue` is not a parameter — presumably a
    # module-level queue shared by all workers; confirm it is defined at
    # import time in this module.
    self.logging_queue = logging_queue
    self.logger = create_default_logger("")
    self.log_level = verbosity
    # Rebinds self.logger to one that forwards through self.logging_queue.
    create_queue_logger(self)
    # Per-worker SQLite database; schema comes from transfer_base metadata.
    self.engine = sqlalchemy.create_engine("sqlite:///{}".format(out_sq))
    transfer_base.metadata.create_all(self.engine)
    self.Session = sessionmaker(bind=self.engine)
    self.session = self.Session()
    self.queue = queue
def test_with_no_gff_utr(self):
    """
    Test the creation of the transcript without the UTR lines, verify
    that everything is still alright (the UTR must be inferred from the
    exon/CDS difference and match the reference transcript self.tr).
    :return:
    """
    # GFF model of AT2G02380.1 with exon and CDS features only — no
    # explicit five_prime_UTR/three_prime_UTR lines.  Columns are
    # space-separated here and normalised to tabs below.
    tr_gff = """Chr2 TAIR10 mRNA 626642 629176 . + . ID=AT2G02380.1;Parent=AT2G02380
Chr2 TAIR10 exon 626642 626780 . + . Parent=AT2G02380.1
Chr2 TAIR10 exon 626842 626880 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 626878 626880 . + 0 Parent=AT2G02380.1
Chr2 TAIR10 exon 626963 627059 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 626963 627059 . + 0 Parent=AT2G02380.1
Chr2 TAIR10 exon 627137 627193 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 627137 627193 . + 2 Parent=AT2G02380.1
Chr2 TAIR10 exon 627312 627397 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 627312 627397 . + 2 Parent=AT2G02380.1
Chr2 TAIR10 exon 627488 627559 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 627488 627559 . + 0 Parent=AT2G02380.1
Chr2 TAIR10 exon 627696 627749 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 627696 627749 . + 0 Parent=AT2G02380.1
Chr2 TAIR10 exon 627840 627915 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 627840 627915 . + 0 Parent=AT2G02380.1
Chr2 TAIR10 exon 628044 628105 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 628044 628105 . + 2 Parent=AT2G02380.1
Chr2 TAIR10 exon 628182 628241 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 628182 628241 . + 0 Parent=AT2G02380.1
Chr2 TAIR10 exon 628465 628676 . + . Parent=AT2G02380.1
Chr2 TAIR10 CDS 628465 628569 . + 0 Parent=AT2G02380.1
Chr2 TAIR10 exon 629070 629176 . + . Parent=AT2G02380.1"""

    tr_lines = tr_gff.split("\n")
    logger = create_default_logger("test")
    logger.setLevel("INFO")
    for pos, line in enumerate(tr_lines):
        # Normalise the space-separated columns above into real GFF tabs,
        # and make sure each line has exactly the nine GFF columns.
        tr_lines[pos] = re.sub(r"\s+", "\t", line)
        assert len(tr_lines[pos].split("\t")) == 9, line.split("\t")

    tr_gff_lines = [parsers.GFF.GffLine(line) for line in tr_lines]
    # First line is the mRNA; the rest are exon/CDS features to attach.
    transcript = loci.Transcript(tr_gff_lines[0], logger=logger)
    for line in tr_gff_lines[1:]:
        transcript.add_exon(line)

    self.assertEqual(transcript.exons, self.tr.exons)
    self.assertNotEqual([], transcript.combined_cds)
    transcript.finalize()
    self.assertTrue(transcript.is_coding)
    # The inferred UTRs must equal those of the reference transcript,
    # which was built from a GFF that includes explicit UTR lines.
    self.assertEqual(transcript.five_utr, self.tr.five_utr)
    self.assertEqual(transcript.three_utr, self.tr.three_utr)
def test_split(self):
    """Loading two ORFs and splitting by CDS must produce two transcripts."""
    self.tr.load_orfs([self.bed1, self.bed2])
    # Sanity-check the ORF coordinates before splitting.
    self.assertEqual(self.tr.selected_cds_start, 15494127)
    self.assertEqual(self.tr.selected_cds_end, 15495994)
    # The other CDS starts at 15494127
    self.assertEqual(self.tr.combined_cds_start, 15490903)
    quiet = create_default_logger(self.tr.id)
    quiet.setLevel("WARN")
    self.tr.logger = quiet
    split_products = list(self.tr.split_by_cds())
    self.assertEqual(len(split_products), 2)
def test_split(self):
    """Splitting a negative-strand transcript with two ORFs: the two
    products must cover the two CDS regions, with UTR trimmed in between.

    Input model (two ORFs loaded onto the same alignment):
    # Chr3 Cufflinks mRNA 2949168 2952410 1000 - . ID=cufflinks_cufflinks_star_at.10687.1.orf1;
    # Chr3 Cufflinks three_prime_UTR 2949168 2950204 . - . Parent=cufflinks_cufflinks_star_at.10687.1.orf1
    # Chr3 Cufflinks exon 2949168 2952410 . - . Parent=cufflinks_cufflinks_star_at.10687.1.orf1
    # Chr3 Cufflinks CDS 2950205 2952208 . - 0 Parent=cufflinks_cufflinks_star_at.10687.1.orf1
    # Chr3 Cufflinks five_prime_UTR 2952209 2952410 . - . Parent=cufflinks_cufflinks_star_at.10687.1.orf1
    #
    # Chr3 Cufflinks mRNA 2949168 2952410 1000 - . ID=cufflinks_cufflinks_star_at.10687.1.orf2;
    # Chr3 Cufflinks three_prime_UTR 2949168 2949169 . - . Parent=cufflinks_cufflinks_star_at.10687.1.orf2
    # Chr3 Cufflinks exon 2949168 2952410 . - . Parent=cufflinks_cufflinks_star_at.10687.1.orf2
    # Chr3 Cufflinks CDS 2949170 2949868 . - 0 Parent=cufflinks_cufflinks_star_at.10687.1.orf2
    # Chr3 Cufflinks five_prime_UTR 2949869 2952410 . - . Parent=cufflinks_cufflinks_star_at.10687.1.orf2

    Expected split:
    # Chr3 Cufflinks mRNA 2950205 2952410 1000 - . ID=cufflinks_cufflinks_star_at.10687.1.split1;
    # Chr3 Cufflinks exon 2950205 2952410 . - . Parent=cufflinks_cufflinks_star_at.10687.1.split1
    # Chr3 Cufflinks CDS 2950205 2952208 . - 0 Parent=cufflinks_cufflinks_star_at.10687.1.split1
    # Chr3 Cufflinks five_prime_UTR 2952209 2952410 . - . Parent=cufflinks_cufflinks_star_at.10687.1.split1
    #
    # Chr3 Cufflinks mRNA 2949168 2949868 1000 - . ID=cufflinks_cufflinks_star_at.10687.1.split2;
    # Chr3 Cufflinks three_prime_UTR 2949168 2949169 . - . Parent=cufflinks_cufflinks_star_at.10687.1.split2
    # Chr3 Cufflinks exon 2949168 2949868 . - . Parent=cufflinks_cufflinks_star_at.10687.1.split2
    # Chr3 Cufflinks CDS 2949170 2949868 . - 0 Parent=cufflinks_cufflinks_star_at.10687.1.split2
    """
    self.tr.load_orfs([self.bed1, self.bed2])
    self.assertEqual(self.tr.strand, "-")
    self.assertEqual(self.tr.number_internal_orfs, 2)
    # On the minus strand, "start" coordinates are genomically greater
    # than "end" coordinates.
    self.assertEqual(self.tr.combined_cds_end, 2949170)
    self.assertEqual(self.tr.combined_cds_start, 2952208)
    self.assertEqual(self.tr.selected_cds_start, 2952208)
    self.assertEqual(self.tr.selected_cds_end, 2950205)
    logger = create_default_logger("splitter")
    logger.setLevel("ERROR")
    self.tr.logger = logger
    new_transcripts = sorted([_ for _ in self.tr.split_by_cds()])
    self.assertEqual(new_transcripts[0].start, self.tr.start)
    # The first (leftmost) split must stop at the end of the second ORF's CDS.
    self.assertEqual(new_transcripts[0].end, 2949868,
                     "\n\n".join([str(_) for _ in new_transcripts]))
def test_split(self):
    """Verify start/end and CDS boundaries of the two products of split_by_cds."""
    self.tr.load_orfs([self.bed1, self.bed2])
    self.assertEqual(self.tr.number_internal_orfs, 2)
    quiet = create_default_logger("splitter")
    quiet.setLevel("ERROR")
    self.tr.logger = quiet
    splits = sorted(self.tr.split_by_cds(),
                    key=operator.attrgetter("start", "end"))
    self.assertEqual(len(splits), 2)
    first, second = splits
    self.assertEqual(first.start, 72914)
    self.assertEqual(first.end, 74914)
    self.assertEqual(second.end, 76276, self.tr.internal_orfs)
    self.assertEqual(second.start, 75394)
    self.assertEqual(first.selected_cds_start, 74914)
    self.assertEqual(first.selected_cds_end, 74336)
    self.assertEqual(second.selected_cds_start, 75804)
    self.assertEqual(second.selected_cds_end, 75394)
class MonoHolderTester(unittest.TestCase):

    """Tests for MonosublocusHolder.is_intersecting: decide whether two
    transcripts belong in the same holder based on intron structure and,
    optionally (cds_only=True), on CDS overlap alone."""

    logger = create_default_logger("MonoHolderTester")

    def setUp(self):
        """Build the reference transcript t1 on Chr1:+ with four exons
        (101-1500) and a CDS running from 401 to 1440."""
        self.conf = dict()
        self.t1 = Transcript()
        self.t1.chrom = "Chr1"
        self.t1.strand = "+"
        self.t1.score = 20
        self.t1.id = "G1.1"
        self.t1.parent = "G1"
        self.t1.start = 101
        self.t1.end = 1500
        self.t1.add_exons([(101, 500), (601, 700), (1001, 1300), (1401, 1500)], "exon")
        self.t1.add_exons([(401, 500), (601, 700), (1001, 1300), (1401, 1440)], "CDS")
        self.t1.finalize()

    def testCdsOverlap(self):
        """A transcript with an identical CDS must intersect t1."""
        t2 = Transcript()
        t2.chrom = "Chr1"
        t2.strand = "+"
        t2.score = 1
        t2.id = "G2.1"
        t2.parent = "G2"
        t2.start = 101
        t2.end = 1600
        t2.add_exons([(101, 500), (601, 700), (1001, 1300), (1401, 1460), (1501, 1600)],
                     "exon")
        t2.add_exons([(401, 500), (601, 700), (1001, 1300), (1401, 1440)], "CDS")
        t2.finalize()
        self.assertTrue(MonosublocusHolder.is_intersecting(self.t1, t2))

    def test_intronMatch(self):
        """Shared introns: intersecting both with and without cds_only."""
        t2 = Transcript()
        t2.chrom = "Chr1"
        t2.strand = "+"
        t2.score = 1
        t2.id = "G2.1"
        t2.parent = "G2"
        t2.start = 101
        t2.end = 1600
        t2.add_exons([(101, 500), (601, 700), (1001, 1320), (1451, 1460), (1501, 1600)],
                     "exon")
        t2.add_exons([(401, 500), (601, 700), (1001, 1320), (1451, 1460), (1501, 1510)],
                     "CDS")
        t2.finalize()
        self.assertTrue(self.t1.is_coding)
        self.assertTrue(t2.is_coding)
        self.assertTrue(
            MonosublocusHolder.is_intersecting(self.t1, t2, logger=self.logger))
        self.assertTrue(
            MonosublocusHolder.is_intersecting(self.t1, t2, cds_only=True,
                                               logger=self.logger))

    def test_intronOverlap(self):
        """Non-coding transcripts with overlapping intron spans intersect."""
        self.t1.strip_cds()
        t2 = Transcript()
        t2.chrom = "Chr1"
        t2.strand = "+"
        t2.score = 1
        t2.id = "G2.1"
        t2.parent = "G2"
        t2.start = 101
        t2.end = 1470
        t2.add_exons([(101, 510), (601, 700), (960, 1350), (1420, 1470)])
        t2.finalize()
        self.assertTrue(MonosublocusHolder.is_intersecting(self.t1, t2))

    def test_noIntronOverlap(self):
        """Non-coding transcripts whose introns do not overlap do not intersect."""
        self.t1.strip_cds()
        t2 = Transcript()
        t2.chrom = "Chr1"
        t2.strand = "+"
        t2.score = 1
        t2.id = "G2.1"
        t2.parent = "G2"
        t2.start = 1250
        t2.end = 2000
        t2.add_exons([(1250, 1560), (1800, 2000)])
        t2.finalize()
        self.assertFalse(MonosublocusHolder.is_intersecting(self.t1, t2))

    def test_noCDSOverlap(self):
        """Exonic overlap without CDS overlap: intersecting only when
        cds_only is False."""
        self.t1.strip_cds()
        self.assertEqual(self.t1.combined_cds_introns, set())
        self.t1.finalized = False
        # Re-add a shorter CDS (401-1100) so the two CDS regions are disjoint.
        self.t1.add_exons([(401, 500), (601, 700), (1001, 1100)], "CDS")
        self.t1.finalize()
        t2 = Transcript()
        t2.logger = self.logger
        t2.chrom = "Chr1"
        t2.strand = "+"
        t2.score = 1
        t2.id = "G2.1"
        t2.parent = "G2"
        t2.start = 101
        t2.end = 1470
        t2.add_exons([(101, 510), (601, 700), (960, 1350), (1421, 1470)])
        t2.add_exons([(1201, 1350), (1421, 1450)], "CDS")
        t2.finalize()
        self.assertTrue(self.t1.is_coding)
        self.assertTrue(t2.is_coding)
        # The two CDS intervals must NOT overlap (overlap() <= 0).
        self.assertGreaterEqual(
            0,
            overlap((self.t1.combined_cds_start, self.t1.combined_cds_end),
                    (t2.combined_cds_start, t2.combined_cds_end)),
            [(self.t1.combined_cds_start, self.t1.combined_cds_end),
             (t2.combined_cds_start, t2.combined_cds_end)])
        self.assertTrue(
            MonosublocusHolder.is_intersecting(self.t1, t2, logger=self.logger))
        self.assertFalse(
            MonosublocusHolder.is_intersecting(self.t1, t2, cds_only=True,
                                               logger=self.logger))

    def test_only_CDS_overlap(self):
        """Exon overlap without shared introns does not make an intersection."""
        t2 = Transcript()
        t2.chrom = "Chr1"
        t2.strand = "+"
        t2.score = 1
        t2.id = "G2.1"
        t2.parent = "G2"
        t2.start = 1250
        t2.end = 2000
        t2.add_exons([(1250, 1560), (1801, 2000)])
        t2.add_exons([(1401, 1560), (1801, 1850)], "CDS")
        t2.finalize()
        self.assertFalse(MonosublocusHolder.is_intersecting(self.t1, t2))
        t2.strip_cds()
        t2.finalized = False
        t2.add_exons([(1461, 1560), (1801, 1850)], "CDS")
        # No CDS overlap this time
        # NOTE(review): t2 is not re-finalized after the new CDS is added —
        # presumably is_intersecting (or the property accesses) finalizes
        # internally; confirm.
        self.assertFalse(MonosublocusHolder.is_intersecting(self.t1, t2))

    def test_no_overlap(self):
        """Completely disjoint transcripts do not intersect."""
        t2 = Transcript()
        t2.chrom = "Chr1"
        t2.strand = "+"
        t2.score = 1
        t2.id = "G2.1"
        t2.parent = "G2"
        t2.start = 1600
        t2.end = 2000
        t2.add_exons([(1600, 1700), (1801, 2000)])
        t2.add_exons([(1661, 1700), (1801, 1850)], "CDS")
        t2.finalize()
        self.assertFalse(MonosublocusHolder.is_intersecting(self.t1, t2))

    def test_same_id(self):
        """Two transcripts with the same ID never intersect, regardless of overlap."""
        t2 = Transcript()
        t2.chrom = "Chr1"
        t2.strand = "+"
        t2.score = 1
        t2.id = "G1.1"
        t2.parent = "G1"
        t2.start = 1250
        t2.end = 2000
        t2.add_exons([(1250, 1560), (1801, 2000)])
        t2.add_exons([(1401, 1560), (1801, 1850)], "CDS")
        t2.finalize()
        # This fails because they have the same ID
        self.assertFalse(MonosublocusHolder.is_intersecting(self.t1, t2))
def main():
    """Transfer a reference annotation onto GMAP alignments of its cDNAs.

    Reads the reference and target cDNA FASTAs and their transcriptomic
    BED12s, parses the transferred annotation (``-gf``, BED12 or GFF3),
    and dispatches each transcript to a pool of ``Transferer`` worker
    processes via a queue.  Workers store results in per-worker SQLite
    databases, which are drained and printed at the end.
    """
    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("--bed12", nargs=2, required=True,
                        help="Transcriptomic cDNAs BED12s")
    parser.add_argument("--cdnas", nargs=2, required=True)
    parser.add_argument("-gf", help="GFF3/BED12 of the transferred annotation.",
                        required=True)
    parser.add_argument("--out", default=sys.stdout, type=argparse.FileType("wt"))
    parser.add_argument("-ob", "--out-bed", dest="out_bed", required=False,
                        default=None, type=argparse.FileType("wt"))
    log = parser.add_mutually_exclusive_group()
    log.add_argument("-q", "--quiet", default=False, action="store_true")
    log.add_argument("-v", "--verbose", default=False, action="store_true")
    parser.add_argument("-p", "--processes", type=int, default=mp.cpu_count())
    args = parser.parse_args()

    logger = create_default_logger("master")
    verbosity = "INFO"
    if args.verbose is True:
        verbosity = "DEBUG"
    elif args.quiet is True:
        verbosity = "WARNING"
    listener = logging.handlers.QueueListener(logging_queue, logger)
    listener.propagate = False
    listener.start()
    logger.setLevel(verbosity)

    cdnas = dict()
    beds = dict()
    beds["ref"] = dict()
    beds["target"] = dict()
    # GMAP appends ".mrnaN" to multi-mapping alignment names; strip it to
    # recover the original transcript ID.  BUG FIX: raw string — "\." in a
    # plain string literal is an invalid escape sequence (DeprecationWarning).
    gmap_pat = re.compile(r"\.mrna[0-9]*$")

    logger.info("Loading reference cDNAS")
    cdnas["ref"] = pyfaidx.Fasta(args.cdnas[0])
    logger.info("Loading target cDNAS")
    cdnas["target"] = pyfaidx.Fasta(args.cdnas[1])
    logger.info("Loaded cDNAs")

    logger.info("Loading reference BED12")
    for entry in Bed12Parser(args.bed12[0], transcriptomic=True):
        if entry.header:
            continue
        name = entry.chrom
        if name in beds["ref"]:
            raise KeyError("Duplicated ID for the reference: {}".format(name))
        if name not in cdnas["ref"]:
            raise KeyError("Reference {} not found in the cDNAs!".format(name))
        beds["ref"][name] = entry

    logger.info("Loading target BED12")
    # name -> {alignment id -> entry}: one cDNA may have multiple alignments.
    beds["target"] = defaultdict(dict)
    for entry in Bed12Parser(args.bed12[1], transcriptomic=True):
        # Now, here we have to account for the fact that there *might* be multiple alignments
        name = re.sub(gmap_pat, "", entry.chrom)
        if entry.chrom not in cdnas["target"]:
            raise KeyError("Target {} not found in the cDNAs!".format(entry.chrom))
        beds["target"][name][entry.chrom] = entry
    logger.info("Loaded BED12s")

    # Now let us start parsing the GFF3, which we presume being a GMAP GFF3
    transcript = None
    logger.info("Launching sub-processes")
    procs = []
    queue = mp.Queue(-1)
    for proc in range(args.processes):
        # Reserve a unique temporary file name for each worker's SQLite DB.
        sq = tempfile.NamedTemporaryFile(mode="wb")
        sq.close()
        sq = sq.name
        _proc = Transferer(sq, queue, verbosity=verbosity)
        _proc.start()
        procs.append(_proc)
    logger.info("Launched sub-processes, starting parsing annotation")

    tnum = -1
    if args.gf.endswith(("bed12", "bed")):
        parser = Bed12Parser(args.gf, transcriptomic=False)
        for line in parser:
            if line.header:
                continue
            transcript = Transcript(line)
            tid = re.sub(gmap_pat, "", transcript.id)
            logger.debug("Found %s", tid)
            ref_cdna = str(cdnas["ref"][tid])
            ref_bed = beds["ref"][tid]
            target_cdna = str(cdnas["target"][transcript.id])
            target_bed = beds["target"][tid][transcript.id]
            tnum += 1
            logger.debug("Submitting %s", tid)
            queue.put((tnum, (transcript, ref_cdna, ref_bed, target_cdna, target_bed)))
            if tnum >= 10**4 and tnum % 10**4 == 0:
                # BUG FIX: logging uses %-style lazy formatting for extra
                # args; "{}" was printed literally instead of the count.
                logger.info("Parsed %d transcripts", tnum)
        logger.info("Finished parsing input genomic BED file")
    else:
        parser = to_gff(args.gf)
        for pos, line in enumerate(parser):
            if line.header is True:
                # or (not isinstance(line, BED12) and line.is_gene is True):
                if str(line) == "###":
                    continue
                try:
                    print(line, file=args.out)
                except IndexError:
                    raise IndexError(line._line)
                continue
            elif not isinstance(line, BED12) and line.is_gene is True:
                continue
            elif line.is_transcript is True:
                # Flush the previous transcript before starting a new one.
                if transcript:
                    if transcript.alias is None:
                        tid = re.sub(gmap_pat, "", transcript.id)
                    else:
                        tid = re.sub(gmap_pat, "", transcript.alias)
                    ref_cdna = str(cdnas["ref"][tid])
                    ref_bed = beds["ref"][tid]
                    target_cdna = str(cdnas["target"][transcript.id])
                    store = beds["target"].get(tid, None)
                    if store is None:
                        raise KeyError((tid, beds["target"].keys()))
                    target_bed = store.get(transcript.id, None)
                    if target_bed is None:
                        raise KeyError((tid, store.keys()))
                    tnum += 1
                    queue.put((tnum,
                               (transcript, ref_cdna, ref_bed, target_cdna, target_bed)))
                try:
                    transcript = Transcript(line)
                except (ValueError, TypeError):
                    raise ValueError((pos, line))
            elif line.is_exon is True:
                transcript.add_exon(line)
            if tnum >= 10**4 and tnum % 10**4 == 0:
                # BUG FIX: %-style lazy formatting, as above.
                logger.info("Parsed %d transcripts", tnum)
        # Flush the last transcript of the file.
        if transcript:
            tnum += 1
            tid = re.sub(gmap_pat, "", transcript.id)
            ref_cdna = str(cdnas["ref"][tid])
            ref_bed = beds["ref"][tid]
            target_cdna = str(cdnas["target"][transcript.id])
            target_bed = beds["target"][tid][transcript.id]
            queue.put((tnum, (transcript, ref_cdna, ref_bed, target_cdna, target_bed)))
        logger.info("Finished parsing input genomic GF file")

    queue.put("EXIT")
    logger.info("Waiting for subprocesses to finish")
    for _proc in procs:
        _proc.join()

    # Now the printing ...
    logger.info("Subprocesses finished, printing")
    for proc in procs:
        sq = sqlalchemy.create_engine("sqlite:///{}".format(proc.out_sq))
        for res in sq.execute("select * from storer"):
            num, bed12, gff3 = res
            if args.out_bed is not None:
                print(bed12.decode(), file=args.out_bed)
            print(*gff3.decode().split("\n"), file=args.out, sep="\n")
        os.remove(proc.out_sq)
    logger.info("Finished!")
    return
def main():
    """Load mikado-compare statistics files into an SQLite database.

    Each row of the ``input_files`` manifest names a species/aligner/assembler
    combination plus the basenames of its "complete" and "filtered"
    comparison outputs; an ``Indexer`` row and a ``CompareFiles`` row are
    stored for each.
    """
    logger = create_default_logger("stat_serializer")
    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("--db", required=True, help="SQLite database to connect to.")
    parser.add_argument("--force", action="store_true", default=False)
    parser.add_argument(
        "input_files",
        help="""TXT tab-delimited file, specifying the input files in the following way:
- species
- aligner
- assembler
- basename for the comparisons against the complete reference
- basename for the comparisons against the filtered reference
""")
    args = parser.parse_args()

    # Create the database
    connector = functools.partial(sqlite3.connect, database=args.db,
                                  check_same_thread=False)
    engine = create_engine("sqlite://", creator=connector)

    if args.force is True:
        # BUG FIX: logger.warn is a deprecated alias of logger.warning.
        logger.warning("Removing old data because force option in place")
        meta = sqlalchemy.MetaData(bind=engine)
        meta.reflect(engine)
        # Drop in reverse dependency order so foreign keys do not block drops.
        for tab in reversed(meta.sorted_tables):
            logger.warning("Dropping %s", tab)
            tab.drop()

    inspector = Inspector.from_engine(engine)
    Session = sessionmaker(bind=engine)
    session = Session()
    if Indexer.__tablename__ not in inspector.get_table_names():
        DBBASE.metadata.create_all(engine)  # @UndefinedVariable

    with open(args.input_files) as input_files:
        for row in input_files:
            species, aligner, assembler, complete, filtered = row.rstrip().split()
            if not os.path.exists("{}.stats".format(complete)):
                raise ValueError("Original file not found; line:\n{}".format(row))
            if not os.path.exists("{}.stats".format(filtered)):
                raise ValueError("Filtered file {} not found; line:\n{}".format(
                    "{}.stats".format(filtered), row))
            current_species = Indexer(species, aligner, assembler)
            session.add(current_species)
            # Commit immediately so the auto-generated m_index is available.
            session.commit()
            print(current_species.m_index)
            complete_load = CompareFiles(current_species.m_index, complete,
                                         filtered=False)
            session.add(complete_load)
            # NOTE(review): loading of the *filtered* comparison (a second
            # CompareFiles row with filtered=True) is not implemented yet —
            # the original carried it as commented-out code.
    session.commit()
class TestMetricsEndDistances(unittest.TestCase):

    """Tests for end_distance_from_tes / end_distance_from_junction: the
    distance (in cDNA bases) from the end of the CDS to the transcript end
    and to the last junction, on both strands."""

    logger = create_default_logger("End")
    logger.setLevel("ERROR")

    def setUp(self):
        """Build a 12-exon transcript spanning 101-10000; each test adds a
        different CDS onto it."""
        self.tr = Transcript()
        self.tr.logger = self.logger
        self.tr.start = 101
        self.tr.end = 10000
        self.tr.add_exons([(101, 300),
                           (501, 800),
                           (1001, 1200),
                           (1301, 2000),
                           (3501, 5000),
                           (5501, 6000),
                           (6201, 7000),
                           (7301, 7700),
                           (8201, 9000),
                           (9101, 9300),
                           (9501, 9700),
                           (9801, 10000)])
        self.tr.id = "test1"
        self.tr.parent = "test1.gene"

    def test_end_positive(self):
        """Plus strand: CDS end inside exon (9101, 9300)."""
        self.tr.strand = "+"
        cds = [(1161, 1200),  # 40 % 3 == 1
               (1301, 2000),  # 700 % 3 == 1
               (3501, 5000),  # 1500 % 3 == 0
               (5501, 6000),  # 500 % 3 == 2
               (6201, 7000),  # 800 % 3 == 2
               (7301, 7700),  # 400 % 3 == 1
               (8201, 9000),  # 800 % 3 == 2
               (9101, 9130)]
        self.tr.add_exons(cds, features="CDS")
        self.tr.finalize()
        self.assertEqual(self.tr.selected_cds_end, self.tr.combined_cds_end)
        self.assertEqual(self.tr.selected_cds_end, 9130)
        # Distance to the last junction: rest of exon (9101, 9300) plus
        # the whole exon (9501, 9700).
        self.assertEqual(self.tr.end_distance_from_junction,
                         (9300 - 9131 + 1) + (9700 - 9501 + 1))
        # Distance to the TES additionally includes the last exon.
        self.assertEqual(self.tr.end_distance_from_tes,
                         (9300 - 9131 + 1) + (9700 - 9501 + 1) + (10000 - 9801 + 1))

        self.tr.strip_cds()
        self.assertEqual(len(self.tr.internal_orfs), 0, self.tr.internal_orfs)
        self.tr.finalized = False
        # Second ORF: CDS end inside the penultimate exon (9501, 9700).
        cds = [(1161, 1200),  # 40 % 3 == 1
               (1301, 2000),  # 700 % 3 == 1
               (3501, 5000),  # 1500 % 3 == 0
               (5501, 6000),  # 500 % 3 == 2
               (6201, 7000),  # 800 % 3 == 2
               (7301, 7700),  # 400 % 3 == 1
               (8201, 9000),  # 800 % 3 == 2
               (9101, 9300),  # 200 % 3 == 2
               (9501, 9690)   # 190 % 3 == 1
               ]
        self.tr.add_exons(cds, features="CDS")
        self.tr.finalize()
        self.assertEqual(self.tr.combined_cds_end, 9690)
        self.assertEqual(self.tr.selected_cds_end, self.tr.combined_cds_end)
        self.assertEqual(self.tr.end_distance_from_junction,
                         (9700 - 9691 + 1))
        self.assertEqual(self.tr.end_distance_from_tes,
                         (9700 - 9691 + 1) + (10000 - 9801 + 1))

        self.tr.strip_cds()
        # After stripping, the CDS ends must be reset to None.
        self.assertEqual(self.tr.combined_cds_end, self.tr.selected_cds_end,
                         self.tr.combined_cds)
        self.assertEqual(self.tr.combined_cds_end, None,
                         self.tr.combined_cds_end)
        self.tr.finalized = False
        # Third ORF: CDS end inside the last exon (9801, 10000).
        cds = [(1161, 1200),  # 40 % 3 == 1
               (1301, 2000),  # 700 % 3 == 1
               (3501, 5000),  # 1500 % 3 == 0
               (5501, 6000),  # 500 % 3 == 2
               (6201, 7000),  # 800 % 3 == 2
               (7301, 7700),  # 400 % 3 == 1
               (8201, 9000),  # 800 % 3 == 2
               (9101, 9300),  # 200 % 3 == 2
               (9501, 9700),  # 200 % 3 == 2
               (9801, 9820),  # 20 % 3 == 2
               ]
        self.tr.add_exons(cds, features="CDS")
        self.tr.finalize()
        self.assertEqual(self.tr.combined_cds_end, 9820)
        self.assertEqual(self.tr.selected_cds_end, self.tr.combined_cds_end)
        # CDS ends in the terminal exon: no junction downstream, TES 180 bp away.
        self.assertEqual(self.tr.end_distance_from_tes, 180)
        self.assertEqual(self.tr.end_distance_from_junction, 0)

    def test_end_negative(self):
        """Minus strand: the "end" of the CDS is on the genomic left."""
        self.tr.strand = "-"
        # Exons were already added in setUp (101-10000, 12 exons).
        cds = [(1161, 1200),  # 40 % 3 == 1
               (1301, 2000),  # 700 % 3 == 1
               (3501, 5000),  # 1500 % 3 == 0
               (5501, 6000),  # 500 % 3 == 2
               (6201, 7000),  # 800 % 3 == 2
               (7301, 7700),  # 400 % 3 == 1
               (8201, 9000),  # 800 % 3 == 2
               (9101, 9130)]
        # The CDS must be a multiple of 3 to be a complete ORF.
        self.assertEqual(sum(x[1] - x[0] + 1 for x in cds) % 3, 0)
        self.tr.add_exons(cds, features="CDS")
        self.tr.finalize()
        self.assertTrue(self.tr.is_coding)
        self.assertEqual(self.tr.selected_cds_end, self.tr.combined_cds_end)
        self.assertEqual(self.tr.selected_cds_end, 1161)
        # Distance to the last junction (leftwards): rest of exon
        # (1001, 1200) plus the whole exon (501, 800).
        self.assertEqual(self.tr.end_distance_from_junction,
                         (1161 - 1001) + (800 - 501 + 1),
                         (self.tr.end_distance_from_junction,
                          (1161 - 1001) + (800 - 501 + 1)))
        self.assertEqual(self.tr.end_distance_from_tes,
                         self.tr.end_distance_from_junction + (300 - 101 + 1),
                         (self.tr.end_distance_from_tes,
                          self.tr.end_distance_from_junction + (300 - 101 + 1)))

        self.tr.strip_cds()
        self.assertEqual(len(self.tr.internal_orfs), 0, self.tr.internal_orfs)
        self.tr.finalized = False
        # Second ORF: CDS end inside exon (501, 800).
        cds = [(721, 800),    # 80 % 3 == 2
               (1001, 1200),  # 200 % 3 == 2
               (1301, 2000),  # 700 % 3 == 1
               (3501, 5000),  # 1500 % 3 == 0
               (5501, 6000),  # 500 % 3 == 2
               (6201, 7000),  # 800 % 3 == 2
               (7301, 7700),  # 400 % 3 == 1
               (8201, 9000),  # 800 % 3 == 2
               (9101, 9130),  # 30 % 3 == 0
               ]
        self.tr.add_exons(cds, features="CDS")
        self.tr.finalize()
        self.assertEqual(self.tr.combined_cds_end, 721)
        self.assertEqual(self.tr.selected_cds_end, self.tr.combined_cds_end)
        self.assertEqual(self.tr.end_distance_from_junction,
                         (721 - 501),
                         (self.tr.end_distance_from_junction, (721 - 501)))
        self.assertEqual(self.tr.end_distance_from_tes,
                         self.tr.end_distance_from_junction + (300 - 101 + 1),
                         (self.tr.end_distance_from_tes,
                          self.tr.end_distance_from_junction + (300 - 101 + 1)))

        self.tr.strip_cds()
        # After stripping, the CDS ends must be reset to None.
        self.assertEqual(self.tr.combined_cds_end, self.tr.selected_cds_end,
                         self.tr.combined_cds)
        self.assertEqual(self.tr.combined_cds_end, None,
                         self.tr.combined_cds_end)
        self.tr.finalized = False
        # Third ORF: CDS end inside the first exon (101, 300).
        cds = [(161, 300),    # 140 % 3 == 2
               (501, 800),    # 300 % 3 == 0
               (1001, 1200),  # 200 % 3 == 2
               (1301, 2000),  # 700 % 3 == 1
               (3501, 5000),  # 1500 % 3 == 0
               (5501, 6000),  # 500 % 3 == 2
               (6201, 7000),  # 800 % 3 == 2
               (7301, 7700),  # 400 % 3 == 1
               (8201, 9000),  # 800 % 3 == 2
               (9101, 9130),  # 30 % 3 == 0
               ]
        self.assertEqual(sum((_[1] - _[0] + 1) % 3 for _ in cds) % 3, 0)
        self.tr.logger = self.logger
        self.tr.add_exons(cds, features="CDS")
        self.tr.finalize()
        self.assertEqual(self.tr.combined_cds_end, 161)
        self.assertEqual(self.tr.selected_cds_end, self.tr.combined_cds_end)
        # CDS ends in the terminal exon: no junction downstream, TES 60 bp away.
        self.assertEqual(self.tr.end_distance_from_tes, 60)
        self.assertEqual(self.tr.end_distance_from_junction, 0)
class AugustusTester(unittest.TestCase):

    """Checks that Transcript.finalize handles Augustus/EnsEMBL gene models
    whose CDS is truncated at the 5' and/or 3' end.

    NOTE(review): the embedded GFF blobs are whitespace-normalised before
    parsing (split then re-joined on tabs), so field spacing inside the
    triple-quoted strings is irrelevant; only the newlines separating the
    records matter.
    """

    # Shared logger: the tests capture and inspect records emitted on the
    # "augustus" channel via assertLogs.
    logger = create_default_logger("augustus")
    logger.setLevel("DEBUG")

    def test_truncated(self):
        # 5'-truncated model: the mRNA starts at base 1, the first CDS
        # segment carries phase 1 and there is no five_prime_UTR.
        lines = """Triticum_aestivum_CS42_TGACv1_scaffold_000043_1AL Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus mRNA 1 2785 . + . ID=TRIAE4565_1AL_Aug_0021880.1;Parent=TRIAE4565_1AL_Aug_0021880;Name=TRIAE4565_1AL_Aug_0021880.1
Triticum_aestivum_CS42_TGACv1_scaffold_000043_1AL Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus CDS 1601 2446 . + 1 ID=TRIAE4565_1AL_Aug_0021880.1.CDS1;Parent=TRIAE4565_1AL_Aug_0021880.1
Triticum_aestivum_CS42_TGACv1_scaffold_000043_1AL Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus exon 1601 2446 . + . ID=TRIAE4565_1AL_Aug_0021880.1.exon1;Parent=TRIAE4565_1AL_Aug_0021880.1
Triticum_aestivum_CS42_TGACv1_scaffold_000043_1AL Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus CDS 2540 2654 . + 1 ID=TRIAE4565_1AL_Aug_0021880.1.CDS2;Parent=TRIAE4565_1AL_Aug_0021880.1
Triticum_aestivum_CS42_TGACv1_scaffold_000043_1AL Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus exon 2540 2785 . + . ID=TRIAE4565_1AL_Aug_0021880.1.exon2;Parent=TRIAE4565_1AL_Aug_0021880.1
Triticum_aestivum_CS42_TGACv1_scaffold_000043_1AL Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus three_prime_UTR 2655 2785 . + . ID=TRIAE4565_1AL_Aug_0021880.1.three_prime_UTR1;Parent=TRIAE4565_1AL_Aug_0021880.1"""
        # Normalise any run of whitespace to single tabs before parsing.
        lines = [
            parsers.GFF.GffLine("\t".join(_.split())) for _ in lines.split("\n")
        ]
        transcript = loci.Transcript(lines[0], logger=self.logger)
        transcript.add_exons(lines[1:])
        # Finalisation must warn about the truncation but keep the CDS.
        with self.assertLogs("augustus", level="WARNING") as cm_out:
            transcript.finalize()
        self.assertTrue(
            any("The transcript TRIAE4565_1AL_Aug_0021880.1 has coordinates 1:2785" in _
                for _ in cm_out.output))
        self.assertTrue(transcript.is_coding)

    def test_three_truncated(self):
        # 3'-truncated model: the last CDS segment coincides with the end of
        # the last exon, well before the annotated mRNA end (224434).
        lines = """Triticum_aestivum_CS42_TGACv1_scaffold_000112_1AL Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus mRNA 204336 224434 . + . ID=TRIAE4565_1AL_Aug_0024630.1;Parent=TRIAE4565_1AL_Aug_0024630;Name=TRIAE4565_1AL_Aug_0024630.1
Triticum_aestivum_CS42_TGACv1_scaffold_000112_1AL Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus exon 204336 205303 . + . ID=TRIAE4565_1AL_Aug_0024630.1.exon1;Parent=TRIAE4565_1AL_Aug_0024630.1
Triticum_aestivum_CS42_TGACv1_scaffold_000112_1AL Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus five_prime_UTR 204336 204546 . + . ID=TRIAE4565_1AL_Aug_0024630.1.five_prime_UTR1;Parent=TRIAE4565_1AL_Aug_0024630.1
Triticum_aestivum_CS42_TGACv1_scaffold_000112_1AL Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus CDS 204547 205303 . + 0 ID=TRIAE4565_1AL_Aug_0024630.1.CDS1;Parent=TRIAE4565_1AL_Aug_0024630.1
Triticum_aestivum_CS42_TGACv1_scaffold_000112_1AL Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus CDS 206227 207040 . + 2 ID=TRIAE4565_1AL_Aug_0024630.1.CDS2;Parent=TRIAE4565_1AL_Aug_0024630.1
Triticum_aestivum_CS42_TGACv1_scaffold_000112_1AL Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus exon 206227 207040 . + . ID=TRIAE4565_1AL_Aug_0024630.1.exon2;Parent=TRIAE4565_1AL_Aug_0024630.1"""
        lines = [
            parsers.GFF.GffLine("\t".join(_.split())) for _ in lines.split("\n")
        ]
        transcript = loci.Transcript(lines[0], logger=self.logger)
        transcript.add_exons(lines[1:])
        with self.assertLogs("augustus", level="WARNING") as cm_out:
            transcript.finalize()
        self.assertTrue(
            any("The transcript TRIAE4565_1AL_Aug_0024630.1 has coordinates 204336:224434" in _
                for _ in cm_out.output))
        self.assertTrue(transcript.is_coding)

    def test_invalid_three_truncated(self):
        # Invalid model: an extra exon (208227-210040) lies downstream of the
        # truncated CDS, so the CDS cannot legitimately be 3'-truncated and
        # finalisation should drop it (transcript ends up non-coding).
        lines = """Triticum_aestivum_CS42_TGACv1_scaffold_000112_1AL Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus mRNA 204336 225434 . + . ID=TRIAE4565_1AL_Aug_0024630.1;Parent=TRIAE4565_1AL_Aug_0024630;Name=TRIAE4565_1AL_Aug_0024630.1
Triticum_aestivum_CS42_TGACv1_scaffold_000112_1AL Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus exon 204336 205303 . + . ID=TRIAE4565_1AL_Aug_0024630.1.exon1;Parent=TRIAE4565_1AL_Aug_0024630.1
Triticum_aestivum_CS42_TGACv1_scaffold_000112_1AL Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus five_prime_UTR 204336 204546 . + . ID=TRIAE4565_1AL_Aug_0024630.1.five_prime_UTR1;Parent=TRIAE4565_1AL_Aug_0024630.1
Triticum_aestivum_CS42_TGACv1_scaffold_000112_1AL Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus CDS 204547 205303 . + 0 ID=TRIAE4565_1AL_Aug_0024630.1.CDS1;Parent=TRIAE4565_1AL_Aug_0024630.1
Triticum_aestivum_CS42_TGACv1_scaffold_000112_1AL Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus CDS 206227 207042 . + 2 ID=TRIAE4565_1AL_Aug_0024630.1.CDS2;Parent=TRIAE4565_1AL_Aug_0024630.1
Triticum_aestivum_CS42_TGACv1_scaffold_000112_1AL Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus exon 206227 207042 . + . ID=TRIAE4565_1AL_Aug_0024630.1.exon2;Parent=TRIAE4565_1AL_Aug_0024630.1
Triticum_aestivum_CS42_TGACv1_scaffold_000112_1AL Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus exon 208227 210040 . + . ID=TRIAE4565_1AL_Aug_0024630.1.exon2;Parent=TRIAE4565_1AL_Aug_0024630.1"""
        lines = [
            parsers.GFF.GffLine("\t".join(_.split())) for _ in lines.split("\n")
        ]
        transcript = loci.Transcript(lines[0], logger=self.logger)
        transcript.add_exons(lines[1:])
        with self.assertLogs("augustus", level="WARNING") as cm_out:
            transcript.finalize()
        self.assertTrue(
            any("The transcript TRIAE4565_1AL_Aug_0024630.1 has coordinates 204336:225434" in _
                for _ in cm_out.output))
        # self.assertTrue(any(
        #     "strip_cds" in _ for
        #     _ in cm_out.output))
        self.assertFalse(transcript.is_coding)

    def test_valid_three_truncated(self):
        """
        Picked from the EnsEMBL Human GTF (v. 70)
        :return:
        """
        # Nonsense-mediated-decay model: CDS of 557 bp plus a separate
        # stop_codon feature (3 bp) for a total selected CDS of 560 bp.
        lines = """11\tnonsense_mediated_decay\texon\t134177086\t134177102\t.\t+\t.\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "1"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; exon_id "ENSE00002461794";
11\tnonsense_mediated_decay\tCDS\t134177086\t134177102\t.\t+\t2\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "1"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; protein_id "ENSP00000397929";
11\tnonsense_mediated_decay\texon\t134179522\t134179657\t.\t+\t.\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "2"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; exon_id "ENSE00002147723";
11\tnonsense_mediated_decay\tCDS\t134179522\t134179657\t.\t+\t0\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "2"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; protein_id "ENSP00000397929";
11\tnonsense_mediated_decay\texon\t134180465\t134180545\t.\t+\t.\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "3"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; exon_id "ENSE00001278318";
11\tnonsense_mediated_decay\tCDS\t134180465\t134180545\t.\t+\t2\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "3"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; protein_id "ENSP00000397929";
11\tnonsense_mediated_decay\texon\t134180958\t134181064\t.\t+\t.\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "4"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; exon_id "ENSE00001140726";
11\tnonsense_mediated_decay\tCDS\t134180958\t134181064\t.\t+\t2\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "4"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; protein_id "ENSP00000397929";
11\tnonsense_mediated_decay\texon\t134182243\t134182383\t.\t+\t.\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "5"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; exon_id "ENSE00003177017";
11\tnonsense_mediated_decay\tCDS\t134182243\t134182383\t.\t+\t0\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "5"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; protein_id "ENSP00000397929";
11\tnonsense_mediated_decay\texon\t134182710\t134182781\t.\t+\t.\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "6"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; exon_id "ENSE00003096760";
11\tnonsense_mediated_decay\tCDS\t134182710\t134182781\t.\t+\t0\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "6"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; protein_id "ENSP00000397929";
11\tnonsense_mediated_decay\texon\t134183835\t134183922\t.\t+\t.\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "7"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; exon_id "ENSE00003040614";
11\tnonsense_mediated_decay\tCDS\t134183835\t134183837\t.\t+\t0\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "7"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; protein_id "ENSP00000397929";
11\tnonsense_mediated_decay\tstop_codon\t134183838\t134183840\t.\t+\t0\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "7"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006";
11\tnonsense_mediated_decay\texon\t134184224\t134184335\t.\t+\t.\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "8"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; exon_id "ENSE00003002659";
11\tnonsense_mediated_decay\texon\t134188525\t134188641\t.\t+\t.\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "9"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; exon_id "ENSE00003191545";
11\tnonsense_mediated_decay\texon\t134188771\t134189178\t.\t+\t.\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "10"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; exon_id "ENSE00001441085";"""
        # GTF attributes contain spaces, so split/join only on tabs here.
        lines = [
            parsers.GTF.GtfLine("\t".join(_.split("\t"))) for _ in lines.split("\n")
        ]
        assert all([line.header is False for line in lines])
        transcript = loci.Transcript(lines[0], logger=self.logger)
        transcript.add_exons(lines[1:])
        with self.assertLogs("augustus", level="DEBUG") as cm_out:
            transcript.finalize()
        self.assertTrue(transcript.is_coding)
        # 560 = 17 + 136 + 81 + 107 + 141 + 72 + 3 (CDS) + 3 (stop codon).
        self.assertEqual(
            560,
            transcript.selected_cds_length,
            sum(_[1][1] - _[1][0] + 1 for _ in transcript.selected_internal_orf
                if _[0] == "CDS"))
class PhaseChecker(unittest.TestCase):

    """Verify that Transcript.finalize recomputes CDS phases for a
    negative-strand, ten-CDS gene model and that they match the
    hand-derived expectations stored in ``self.correct_phases``.

    NOTE(review): the embedded GFF blob is whitespace-normalised before
    parsing, so spacing inside the triple-quoted string is irrelevant;
    only the newlines separating the records matter.
    """

    logger = create_default_logger("pcheck")
    logger.setLevel("DEBUG")

    def setUp(self):
        """Build the test transcript and the expected phase table."""
        lines = """Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 mRNA 40282 46004 . - . ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960;Name=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2;aed=0.0;note=TRIAE_CS42_5DL_TGACv1_434051_AA1427960;confidence=High;has_start=True;has_stop=True;original_stop=True;protein_rank=P1;transcript_rank=T2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 exon 40282 40933 . - . ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon1;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 three_prime_UTR 40282 40720 . - . ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.three_prime_UTR1;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 CDS 40721 40933 . - 0 ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS1;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 CDS 41018 41111 . - 1 ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS2;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 exon 41018 41111 . - . ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon2;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 CDS 41227 41468 . - 0 ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS3;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 exon 41227 41468 . - . ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon3;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 CDS 41673 41831 . - 0 ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS4;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 exon 41673 41831 . - . ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon4;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 CDS 41946 42820 . - 2 ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS5;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 exon 41946 42820 . - . ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon5;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 CDS 42905 42913 . - 2 ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS6;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 exon 42905 42913 . - . ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon6;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 CDS 45373 45496 . - 0 ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS7;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 exon 45373 45496 . - . ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon7;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 CDS 45600 45651 . - 1 ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS8;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 exon 45600 45651 . - . ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon8;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 CDS 45726 45726 . - 2 ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS9;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 exon 45726 45726 . - . ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon9;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 CDS 45875 45893 . - 0 ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS10;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 exon 45875 46004 . - . ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon10;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL TGACv1 five_prime_UTR 45894 46004 . - . ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.five_prime_UTR1;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2"""
        # Normalise whitespace to tabs; drop any empty trailing line.
        lines = [GffLine("\t".join(_.split())) for _ in lines.split("\n") if _]
        self.transcript = Transcript(lines[0], logger=self.logger)
        self.transcript.add_exons(lines[1:])
        # Phases expected after finalisation, keyed by (start, end); note
        # they differ from the phases annotated in the GFF above.
        self.correct_phases = {(40721, 40933): 2,
                               (41018, 41111): 0,
                               (41227, 41468): 2,
                               (41673, 41831): 2,
                               (41946, 42820): 1,
                               (42905, 42913): 1,
                               (45373, 45496): 2,
                               (45600, 45651): 0,
                               (45726, 45726): 2,
                               (45875, 45893): 0}

    # BUG FIX: the original used a bare "@unittest.skip". Before Python 3.11
    # unittest.skip requires a reason argument; applied bare, the test method
    # itself is taken as the reason and the returned decorator replaces the
    # test, which then errors at run time instead of being skipped.
    @unittest.skip("disabled in the original test suite")
    def test_check_phases(self):
        """Finalise the transcript and compare the recomputed CDS phases
        against the expectations prepared in setUp."""
        self.transcript.finalize()
        # Map each CDS interval of the primary ORF to its computed phase.
        phases = {_[1]: _[2]
                  for _ in self.transcript.internal_orfs[0] if _[0] == "CDS"}
        self.assertEqual(self.transcript.combined_cds_start, 45893)
        self.assertEqual(phases.keys(), self.correct_phases.keys(),
                         list(zip(sorted(phases.keys()),
                                  sorted(self.correct_phases.keys()))))
        # On mismatch, fail on the first offending interval (highest first)
        # for a more readable error message than the full-dict comparison.
        if self.correct_phases != phases:
            for key in sorted(phases.keys(), reverse=True):
                self.assertEqual(phases[key], self.correct_phases[key],
                                 (key, phases[key], self.correct_phases[key]))
        self.assertEqual(self.correct_phases, phases,
                         (self.correct_phases, phases))