def test_bam2gff(self):
    """Convert the mapped reads of the bundled test BAM into transcripts.

    Every mapped alignment is turned into a ``Transcript``. When a query name
    has already been seen, the transcript is renamed ``<query>_<count>`` and its
    previous identifier is kept as the alias. The test then checks how many
    transcripts were produced and the strand of each.
    """
    name_counter = Counter()
    transcripts = []
    bam_file = pkg_resources.resource_filename("Mikado.tests", "test.reads.bam")
    # Open the BAM as a context manager so the handle is released even when
    # building a transcript raises.
    with AlignmentFile(bam_file, mode="rb") as bam:
        for record in bam:
            if record.is_unmapped is True:
                continue
            transcript = Transcript(record)
            # A Counter yields 0 for unseen keys: one lookup is enough.
            seen = name_counter[record.query_name]
            if seen:
                name = "{}_{}".format(record.query_name, seen)
            else:
                name = record.query_name
            if name != transcript.id:
                transcript.alias = transcript.id
                transcript.id = name
            transcript.parent = transcript.attributes["gene_id"] = "{0}.gene".format(name)
            name_counter.update([record.query_name])
            transcript.source = "bam2gtf"
            transcripts.append(transcript)

    self.assertEqual(len(transcripts), 4)
    self.assertEqual(transcripts[0].strand, '-')
    self.assertEqual(transcripts[1].strand, '+')
    self.assertEqual(transcripts[2].strand, '+')
    self.assertEqual(transcripts[3].strand, '-')
def test_load_orfs(self):
    """Store two ORFs in an in-memory database and fetch back only the first.

    Two queries (``foo`` and its copy ``foo.2`` on ``Chr2``) are serialised
    together with their ORFs. ``Superlocus.get_orfs`` is then asked for the
    first query only, and must return exactly the ORF of ``foo`` as a BED12.
    """
    bed_line = ('Chr1\t100\t2000\tID=foo;coding=True;phase=0'
                '\t0\t+\t300\t1850\t0\t4\t400,400,400,200\t0,500,1100,1700')
    transcript = Transcript(bed_line)
    orf = transcript.orfs[0].to_transcriptomic()

    # Second transcript: same structure, different chromosome and identifier.
    transcript2 = transcript.copy()
    transcript2.unfinalize()
    transcript2.chrom = "Chr2"
    transcript2.id = "foo.2"
    transcript2.finalize()
    other_orf = transcript2.orfs[0].to_transcriptomic()

    engine = create_engine("sqlite:///:memory:")
    db.metadata.create_all(engine)
    session = sessionmaker(bind=engine)()

    query = Query(transcript.id, transcript.cdna_length)
    query2 = Query(transcript2.id, transcript2.cdna_length)
    session.add_all([query, query2])
    session.commit()

    serialized_orf = Orf(orf, query.query_id)
    self.assertEqual(serialized_orf.thick_end, orf.thick_end)
    self.assertEqual(serialized_orf.cds_len, orf.cds_len)
    serialized_other_orf = Orf(other_orf, query2.query_id)
    session.add_all([serialized_orf, serialized_other_orf])
    session.commit()

    sup = Superlocus(transcript)
    sup.session = session
    sup_orfs = asyncio.run(sup.get_orfs([query.query_id]))

    self.assertEqual(len(sup_orfs), 1)
    self.assertIn(transcript.id, sup_orfs)
    self.assertEqual(len(sup_orfs[transcript.id]), 1)
    retrieved = sup_orfs[transcript.id][0]
    self.assertIsInstance(retrieved, BED12, type(retrieved))
    self.assertTrue(retrieved == orf,
                    "\n" + "\n".join([str(orf), str(retrieved)]))
def main():
    """
    Main script function.

    Reads the exon lines of the input GTF, groups them by transcript, builds one
    ``Transcript`` per group and prints the transcripts, sorted, in GTF format
    to the output file (standard output by default).
    """
    parser = argparse.ArgumentParser(
        "Script to add a transcript feature to e.g. Cufflinks GTFs")
    parser.add_argument("gtf", type=argparse.FileType(), help="Input GTF")
    parser.add_argument("out", default=sys.stdout, nargs="?",
                        type=argparse.FileType("w"),
                        help="Output file. Default: stdout.")
    args = parser.parse_args()

    # argparse opened the input only to validate it; the GTF parser below is
    # given the file *name*, so the handle can be released right away.
    args.gtf.close()

    transcript_lines = defaultdict(list)
    for row in GTF(args.gtf.name):
        if row.header is False and row.is_exon is True:
            transcript_lines[row.transcript].append(row)

    transcripts = []
    for exon_rows in transcript_lines.values():
        transcript = Transcript(exon_rows[0])
        transcript.add_exons(exon_rows)
        transcripts.append(transcript)

    for transcript in sorted(transcripts):
        print(transcript.format("gtf"), file=args.out)

    if args.out is not sys.stdout:
        args.out.close()
def test_zero_one_many(self):
    """Introns generated by the junction serializer must match the transcripts' introns.

    Every line of ``zom_junctions.bed`` is fed both to
    ``JunctionSerializer.generate_introns`` and to ``Transcript``; the two sets
    of intron coordinates are then compared.
    """
    from Mikado.transcripts import Transcript

    bed_path = os.path.join(os.path.dirname(__file__), "zom_junctions.bed")
    junctions = []
    transcripts = []
    with parsers.bed12.Bed12Parser(bed_path) as parser:
        for line in parser:
            serializers.junction.JunctionSerializer.generate_introns(0, junctions, line)
            transcripts.append(Transcript(line))

    from_junctions = {(junction.junction_start, junction.junction_end)
                      for junction in junctions}
    from_transcripts = {intron
                        for transcript in transcripts
                        for intron in transcript.introns}
    assert from_junctions == from_transcripts, (from_junctions, from_transcripts)
def main():
    """Strip the UTRs from every gene of an annotation file and print the result.

    The annotation is parsed into ``Gene`` objects (genes, then their
    transcripts, then the exons), each gene is passed through ``strip_utr`` and
    printed in the requested format. When no format is given on the command
    line, the format of the input file is used.
    """
    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("-f", "--format", default=None, choices=["gff3", "gtf"])
    parser.add_argument("gff", type=parser_factory)
    parser.add_argument("out", nargs="?", default=sys.stdout,
                        type=argparse.FileType("wt"))
    args = parser.parse_args()

    if args.format is None:
        args.format = args.gff.file_format

    # Transcript ID -> gene ID, used for exon rows that carry no gene of their own.
    tid2gid = dict()
    genes = OrderedDict()

    for row in args.gff:
        if row.header is True:
            continue
        elif row.is_gene is True:
            genes[row.id] = Gene(row)
        elif row.is_transcript is True:
            assert len(row.parent) == 1
            parent = row.parent[0]
            tid2gid[row.id] = parent
            genes[parent].add(Transcript(row))
        elif row.is_exon is True:
            if row.gene is None:
                gene = tid2gid[row.parent[0]]
            else:
                gene = row.gene
            genes[gene].add_exon(row)

    for gene in genes.values():
        print(strip_utr(gene).format(args.format), file=args.out)

    # Release the output handle unless it is the interpreter's own stdout.
    if args.out is not sys.stdout:
        args.out.close()
def test_get_external(self):
    """Check which external scores ``Superlocus.get_external`` hands back.

    Six external sources (int/float/bool, each in a processed and a raw
    flavour) are stored with one score each for the first query. The test
    covers three configurations: all metrics reported, no metric requested,
    and only the metrics mentioned in the scoring/requirements sections.
    """
    checked_conf = load_and_validate_config(None).copy()
    checked_conf.pick.output_format.report_all_external_metrics = True

    transcript = Transcript()
    transcript.chrom = "15"
    transcript.source = "protein_coding"
    transcript.start = 47631264
    transcript.end = 48051999
    transcript.strand = "+"
    transcript.add_exons([
        (47631264, 47631416),
        (47704590, 47704669),
        (47762671, 47762742),
        (47893062, 47893093),
        (47895572, 47895655),
        (48051942, 48051999),
    ])
    transcript.id = "ENST00000560636"
    transcript.parent = "ENSG00000137872"
    transcript2 = transcript.copy()
    transcript2.id = "ENST00000560637"

    tpm_schema = MinMaxScore.Schema()
    checked_conf.scoring.scoring["attributes.tpm"] = tpm_schema.load({
        "rescaling": "max",
        "default": 0,
        "rtype": "float",
        'multiplier': 4,
        'use_raw': True,
        'percentage': True
    })
    transcript.attributes["tpm"] = 10

    # (source name, value type, raw flag), in the order their IDs are used below.
    sources = [
        ExternalSource(name, rtype, raw_flag)
        for name, rtype, raw_flag in (
            ('int', 'int', 0),
            ('float', 'float', 0),
            ('bool', 'bool', 0),
            ('raw_int', 'int', 1),
            ('raw_float', 'float', 1),
            ('raw_bool', 'bool', 1),
        )
    ]
    # Booleans are cast to int, following the external.py serialize function.
    score_values = (10, 10.0, int(False), 8, 8.0, int(True))
    scores = [External(1, source_id, value)
              for source_id, value in enumerate(score_values, start=1)]

    queries = [Query(transcript.id, transcript.cdna_length),
               Query(transcript2.id, transcript2.cdna_length)]

    engine = create_engine("sqlite:///:memory:")
    db.metadata.create_all(engine)
    session = sessionmaker(bind=engine)()
    session.add_all(sources)
    session.add_all(queries)
    session.add_all(scores)
    session.commit()

    sup = Superlocus(transcript, configuration=checked_conf)
    sup.session = session
    tid = transcript.id
    self.assertIn(tid, sup.transcripts)

    from collections import namedtuple
    query_stub = namedtuple('t', field_names=('query_name'))
    query_stub.query_name = 'ENST00000560636'
    qobj = {1: query_stub}

    # 1) Everything is reported.
    external = asyncio.run(sup.get_external(qobj, [1]))
    expected_all = {
        'ENST00000560636': {
            'int': (10, False),
            'float': (10.0, False),
            'bool': (False, False),
            'raw_int': (8, True),
            'raw_float': (8.0, True),
            'raw_bool': (True, True)
        }
    }
    self.assertEqual(external, expected_all)

    # 2) Nothing is requested, so nothing comes back.
    sup.configuration.pick.output_format.report_all_external_metrics = False
    external = asyncio.run(sup.get_external(qobj, [1]))
    self.assertEqual(len(external), 0)

    # 3) The values below carry no meaning: they only make each metric appear
    # in one configuration section. 'float' appears in none of them and must
    # therefore NOT be loaded.
    scoring = sup.configuration.scoring
    scoring.scoring["external.int"] = MinMaxScore(rescaling="max", filter=None)
    scoring.requirements.parameters["external.raw_float"] = SizeFilter(
        operator="gt", value=100)
    scoring.cds_requirements.parameters["external.raw_int"] = SizeFilter(
        operator="lt", value=1)
    scoring.as_requirements.parameters["external.raw_bool"] = SizeFilter(
        operator="lt", value=1)
    scoring.not_fragmentary.parameters["external.bool"] = SizeFilter(
        operator="ne", value=False)
    external = asyncio.run(sup.get_external(qobj, [1]))
    expected_selected = {
        'ENST00000560636': {
            'int': (10, False),
            'raw_float': (8.0, True),
            'bool': (False, False),
            'raw_int': (8, True),
            'raw_bool': (True, True)
        }
    }
    self.assertEqual(external, expected_selected)
def test_retrieval(self):
    """Check which stored junctions a superlocus loads as verified introns.

    Four junctions are stored: one on another chromosome, one a million bases
    away, one on the opposite strand and one matching an intron of the
    transcript. For every combination of transcript strand and ``stranded``
    flag the set in ``locus_verified_introns`` is compared with the expectation.
    """
    engine = create_engine("sqlite:///:memory:")
    db.metadata.create_all(engine)
    session = sessionmaker(bind=engine)()

    transcript = Transcript(accept_undefined_multi=True)
    transcript.chrom = "15"
    transcript.source = "protein_coding"
    transcript.start = 47631264
    transcript.end = 48051999
    transcript.strand = "+"
    transcript.add_exons([
        (47631264, 47631416),
        (47704590, 47704669),
        (47762671, 47762742),
        (47893062, 47893093),
        (47895572, 47895655),
        (48051942, 48051999),
    ])
    transcript.id = "ENST00000560636"
    transcript.parent = "ENSG00000137872"
    transcript2 = transcript.copy()
    transcript2.id = "ENST00000560637"

    chrom_one = Chrom("1", 10**8)
    chrom_fifteen = Chrom("15", 5 * 10**8)
    session.add_all([chrom_one, chrom_fifteen])
    session.commit()

    # Junction arguments: junction_start, junction_end, name, strand, score, chrom_id.
    intron_start = 47704669 + 1
    intron_end = 47762671 - 1
    shift = 10**6
    # Same coordinates, but on a different chromosome.
    junction_chrom_one = Junction(intron_start, intron_end, "chrom_one", "+", 10,
                                  chrom_one.chrom_id)
    # Right chromosome, but too far away.
    outside_chrom_15 = Junction(intron_start - shift, intron_end - shift,
                                "chrom_15_outside", "+", 10,
                                chrom_fifteen.chrom_id)
    # Right place, wrong strand.
    wrong_strand_chrom_15 = Junction(intron_start, intron_end,
                                     "chrom_15_wrong_strand", "-", 10,
                                     chrom_fifteen.chrom_id)
    # The correct one.
    chrom_15_junction = Junction(intron_start, intron_end, "chrom_15", "+", 10,
                                 chrom_fifteen.chrom_id)
    session.add_all([
        junction_chrom_one, outside_chrom_15, wrong_strand_chrom_15,
        chrom_15_junction
    ])
    session.commit()

    self.assertEqual(junction_chrom_one.chrom, "1")
    for junc in (outside_chrom_15, wrong_strand_chrom_15, chrom_15_junction):
        self.assertEqual(junc.chrom, "15")

    def stranded_key(junction):
        """Return the (start, end, strand) triple of a stored junction."""
        return (junction.junction_start, junction.junction_end, junction.strand)

    for strand, stranded in itertools.product(("+", "-", None), (True, False)):
        transcript.unfinalize()
        transcript.strand = strand
        transcript.finalize()
        sup = Superlocus(transcript, stranded=stranded)
        self.assertTrue(
            (chrom_15_junction.junction_start,
             chrom_15_junction.end) in sup.introns,
            (chrom_15_junction, sup.introns))
        sup.session = session
        asyncio.run(sup._load_introns())

        if stranded is False:
            # Unstranded: junctions from both strands are accepted.
            self.assertEqual(
                sup.locus_verified_introns,
                {stranded_key(chrom_15_junction),
                 stranded_key(wrong_strand_chrom_15)},
                (stranded, strand))
        elif strand is None:
            # Stranded locus built from a transcript without a strand.
            self.assertEqual(sup.locus_verified_introns, set())
        else:
            self.assertEqual(
                sup.locus_verified_introns,
                {(chrom_15_junction.junction_start,
                  chrom_15_junction.junction_end, strand)},
                (stranded, strand))
def test_fusion(self):
    """Check the class codes assigned to a transcript spanning two reference genes.

    ``foo.1`` and ``bar.1`` are written to a temporary reference GTF and
    indexed; ``faz.1`` overlaps both. With ``report_fusions`` enabled the
    assigner must return two results, both with class code ``("f", "j")``;
    with it disabled, a single result with class code ``("j",)``.
    """
    t = Transcript()
    t.chrom, t.strand, t.start, t.end, t.id, t.parent = "Chr1", "+", 101, 1000, "foo.1", "foo"
    t.add_exons([(101, 500), (601, 800), (901, 1000)])
    t.finalize()

    t2 = Transcript()
    t2.chrom, t2.strand, t2.start, t2.end, t2.id, t2.parent = "Chr1", "+", 2001, 3000, "bar.1", "bar"
    t2.add_exons([(2001, 2500), (2601, 2800), (2901, 3000)])
    t2.finalize()

    t3 = Transcript()
    t3.chrom, t3.strand, t3.start, t3.end, t3.id, t3.parent = "Chr1", "+", 651, 2703, "faz.1", "faz"
    t3.add_exons([(651, 800), (901, 1300), (2230, 2500), (2601, 2703)])
    t3.finalize()

    logger = create_default_logger("test_fusion")
    with tempfile.TemporaryDirectory() as folder:
        with open(os.path.join(folder, "reference.gtf"), "wt") as reference:
            print(t.format("gtf"), file=reference)
            print(t2.format("gtf"), file=reference)
        self.assertTrue(os.path.exists(reference.name))

        # Consume the parser once to make sure the reference file is readable.
        for _ in parser_factory(reference.name):
            pass

        index_name = "{}.midx".format(reference.name)
        try:
            indexing.create_index(parser_factory(reference.name), logger, index_name)
        except InvalidParsingFormat:
            # Show the offending file in the failure message; read it through a
            # context manager so the handle is not leaked.
            with open(reference.name) as handle:
                content = "\n".join([line.rstrip() for line in handle])
            self.fail(content)

        namespace = Namespace(default=False)
        namespace.out = os.path.join(folder, "out")
        for report in (False, True):
            with self.subTest(report=report):
                namespace.report_fusions = report
                assigner = Assigner(index_name, args=namespace, printout_tmap=False)
                result = assigner.get_best(t3)
                if report:
                    # assertTrue(len(result), 2) treated "2" as the failure
                    # message and could never fail on a non-empty result.
                    self.assertEqual(len(result), 2)
                    self.assertEqual(result[0].ccode, ("f", "j"), str(result[0]))
                    self.assertEqual(result[1].ccode, ("f", "j"), str(result[1]))
                else:
                    self.assertEqual(result.ccode, ("j", ), str(result))
def _submit_transcript(queue, tnum, transcript, tid, cdnas, beds, logger):
    """Look up the data needed to transfer the CDS of *transcript* and queue the job.

    :param queue: queue read by the worker processes.
    :param tnum: number of the previously submitted job (-1 before the first).
    :param transcript: the genomic transcript on the target.
    :param tid: name of the corresponding reference cDNA.
    :param cdnas: dictionary with the "ref" and "target" FASTA indices.
    :param beds: dictionary with the "ref" and "target" BED12 records.
    :param logger: logger used for debug and progress messages.
    :return: the number assigned to the submitted job.
    :raises KeyError: if a sequence or BED12 record cannot be found.
    """
    ref_cdna = str(cdnas["ref"][tid])
    ref_bed = beds["ref"][tid]
    target_cdna = str(cdnas["target"][transcript.id])
    # Use .get so that a missing entry is reported together with the known keys.
    store = beds["target"].get(tid, None)
    if store is None:
        raise KeyError((tid, beds["target"].keys()))
    target_bed = store.get(transcript.id, None)
    if target_bed is None:
        raise KeyError((tid, store.keys()))

    tnum += 1
    logger.debug("Submitting %s", tid)
    queue.put((tnum, (transcript, ref_cdna, ref_bed, target_cdna, target_bed)))
    if tnum >= 10 ** 4 and tnum % 10 ** 4 == 0:
        # logging uses %-style placeholders; "{}" would break message formatting.
        logger.info("Parsed %d transcripts", tnum)
    return tnum


def main():
    """Transfer the CDS of reference cDNAs onto their alignments on the target.

    The function loads the reference and target cDNAs (FASTA) and transcriptomic
    BED12 files, starts the ``Transferer`` worker processes, reads the
    transferred annotation (genomic BED12 or GFF3), submits one job per
    transcript to the workers, and finally prints the BED12/GFF3 results the
    workers stored in their SQLite files.

    :raises KeyError: if a transcript has no matching cDNA or BED12 record.
    """
    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("--bed12", nargs=2, required=True,
                        help="Transcriptomic cDNAs BED12s")
    parser.add_argument("--cdnas", nargs=2, required=True)
    parser.add_argument("-gf", help="GFF3/BED12 of the transferred annotation.",
                        required=True)
    parser.add_argument("--out", default=sys.stdout, type=argparse.FileType("wt"))
    parser.add_argument("-ob", "--out-bed", dest="out_bed", required=False,
                        default=None, type=argparse.FileType("wt"))
    log = parser.add_mutually_exclusive_group()
    log.add_argument("-q", "--quiet", default=False, action="store_true")
    log.add_argument("-v", "--verbose", default=False, action="store_true")
    parser.add_argument("-p", "--processes", type=int, default=mp.cpu_count())
    args = parser.parse_args()

    logger = create_default_logger("master")
    verbosity = "INFO"
    if args.verbose is True:
        verbosity = "DEBUG"
    elif args.quiet is True:
        verbosity = "WARNING"

    listener = logging.handlers.QueueListener(logging_queue, logger)
    listener.propagate = False
    listener.start()
    logger.setLevel(verbosity)

    cdnas = dict()
    beds = dict()
    beds["ref"] = dict()
    beds["target"] = dict()

    # Raw string: "\." is not a valid escape in a normal string literal.
    gmap_pat = re.compile(r"\.mrna[0-9]*$")

    logger.info("Loading reference cDNAS")
    cdnas["ref"] = pyfaidx.Fasta(args.cdnas[0])
    logger.info("Loading target cDNAS")
    cdnas["target"] = pyfaidx.Fasta(args.cdnas[1])
    logger.info("Loaded cDNAs")

    logger.info("Loading reference BED12")
    for entry in Bed12Parser(args.bed12[0], transcriptomic=True):
        if entry.header:
            continue
        name = entry.chrom
        if name in beds["ref"]:
            raise KeyError("Duplicated ID for the reference: {}".format(name))
        if name not in cdnas["ref"]:
            raise KeyError("Reference {} not found in the cDNAs!".format(name))
        beds["ref"][name] = entry

    logger.info("Loading target BED12")
    beds["target"] = defaultdict(dict)
    for entry in Bed12Parser(args.bed12[1], transcriptomic=True):
        # Skip header lines, as done for the reference file above.
        if entry.header:
            continue
        # Now, here we have to account for the fact that there *might* be
        # multiple alignments: they are grouped under the name without suffix.
        name = re.sub(gmap_pat, "", entry.chrom)
        if entry.chrom not in cdnas["target"]:
            raise KeyError("Target {} not found in the cDNAs!".format(entry.chrom))
        beds["target"][name][entry.chrom] = entry
    logger.info("Loaded BED12s")

    # Now let us start parsing the GFF3, which we presume being a GMAP GFF3
    transcript = None

    logger.info("Launching sub-processes")
    procs = []
    queue = mp.Queue(-1)
    for _ in range(args.processes):
        # Only the unique file name is kept; it is handed to the worker and
        # later read back through ``out_sq`` as an SQLite database.
        sq = tempfile.NamedTemporaryFile(mode="wb")
        sq.close()
        sq = sq.name
        _proc = Transferer(sq, queue, verbosity=verbosity)
        _proc.start()
        procs.append(_proc)
    logger.info("Launched sub-processes, starting parsing annotation")

    tnum = -1
    if args.gf.endswith(("bed12", "bed")):
        annotation = Bed12Parser(args.gf, transcriptomic=False)
        for line in annotation:
            if line.header:
                continue
            transcript = Transcript(line)
            tid = re.sub(gmap_pat, "", transcript.id)
            logger.debug("Found %s", tid)
            tnum = _submit_transcript(queue, tnum, transcript, tid, cdnas, beds, logger)
        logger.info("Finished parsing input genomic BED file")
    else:
        def reference_id(current):
            """Name of the reference cDNA: the alias, when set, wins over the ID."""
            label = current.id if current.alias is None else current.alias
            return re.sub(gmap_pat, "", label)

        annotation = to_gff(args.gf)
        for pos, line in enumerate(annotation):
            if line.header is True:
                if str(line) == "###":
                    continue
                try:
                    print(line, file=args.out)
                except IndexError:
                    raise IndexError(line._line)
                continue
            elif not isinstance(line, BED12) and line.is_gene is True:
                continue
            elif line.is_transcript is True:
                if transcript:
                    # A new transcript starts: the previous one is complete.
                    tnum = _submit_transcript(queue, tnum, transcript,
                                              reference_id(transcript),
                                              cdnas, beds, logger)
                try:
                    transcript = Transcript(line)
                except (ValueError, TypeError) as exc:
                    raise ValueError((pos, line)) from exc
            elif line.is_exon is True:
                transcript.add_exon(line)

        if transcript:
            # Last transcript of the file; resolved exactly like the others.
            tnum = _submit_transcript(queue, tnum, transcript,
                                      reference_id(transcript),
                                      cdnas, beds, logger)
        logger.info("Finished parsing input genomic GF file")

    queue.put("EXIT")
    logger.info("Waiting for subprocesses to finish")
    for _proc in procs:
        _proc.join()

    # Now the printing ...
    logger.info("Subprocesses finished, printing")
    for proc in procs:
        sq = sqlalchemy.create_engine("sqlite:///{}".format(proc.out_sq))
        for res in sq.execute("select * from storer"):
            _, bed12, gff3 = res
            if args.out_bed is not None:
                print(bed12.decode(), file=args.out_bed)
            print(*gff3.decode().split("\n"), file=args.out, sep="\n")
        os.remove(proc.out_sq)

    logger.info("Finished!")
    return
def transfer_cds(transcript: Transcript,
                 ref_cdna: str,
                 ref_bed: BED12,
                 target_cdna: str,
                 target_bed: BED12,
                 logger=create_null_logger()):
    """Transfer the CDS of a reference cDNA onto its alignment on the target.

    :param transcript: genomic transcript on the target; ``None`` is accepted.
    :param ref_cdna: sequence of the reference cDNA.
    :param ref_bed: transcriptomic BED12 of the reference cDNA.
    :param target_cdna: sequence of the target cDNA.
    :param target_bed: transcriptomic BED12 of the target cDNA; may be replaced
        by the one returned by ``transfer_by_alignment``.
    :param logger: logger for debug/warning messages.
    :return: ``(transcript, target_bed, pep_coords)``; ``pep_coords`` is a
        triple ``(start, end, complete)``. For a ``None`` transcript it is
        ``(None, None, False)``, for a non-coding reference ``(None, None, True)``.
    :raises AssertionError: if the target BED12 is not transcriptomic, or if the
        two cDNAs are identical but the CDS was not transferred unchanged.

    The transcript gets the attributes ``aligner_cds`` (and ``was_coding`` for
    non-coding references, ``original_cds`` for coding ones).
    """
    if transcript is None:
        return transcript, target_bed, (None, None, False)

    transcript.finalize()
    assert target_bed.transcriptomic is True

    logger.debug("Starting with %s, phases: %s (BED %s)",
                 transcript.id, transcript.phases, target_bed.phase)

    if ref_bed.coding is False:
        # One placeholder, one argument: the extra "phases" argument made the
        # logging module fail while formatting the record.
        logger.debug("%s is non coding, returning immediately.", transcript.id)
        transcript.attributes["aligner_cds"] = False
        transcript.attributes["was_coding"] = transcript.is_coding
        target_bed.coding = False
        transcript.strip_cds()
        return transcript, target_bed, (None, None, True)

    original_start, original_end = target_bed.thick_start, target_bed.thick_end

    ref_pep = str(
        Seq.Seq(str(ref_cdna[ref_bed.thick_start - 1:ref_bed.thick_end])).translate(
            to_stop=False))

    first_stop = ref_pep.find("*")
    if first_stop == -1:
        pass
    elif abs(first_stop * 3 - ref_bed.cds_len) in (0, 3):
        # This is the "good" case: the CDS is correct.
        ref_pep = ref_pep[:first_stop]
    else:
        logger.warning(
            "The sequence of %s has in frame stop codons. Adjusting the program to take this into account.",
            ref_bed.name)

    logger.debug("%s now has phases: %s (%s)",
                 transcript.id, transcript.phases, target_bed.phase)
    target_bed, pep_coords = transfer_by_alignment(ref_pep, target_cdna, target_bed,
                                                   logger=logger)
    logger.debug("%s now has phases: %s; target bed: %s",
                 transcript.id, transcript.phases, target_bed.phase)
    # The third element tells whether the whole reference peptide was covered.
    pep_coords = (pep_coords[0], pep_coords[1],
                  (pep_coords[0] == 1 and pep_coords[1] == len(ref_pep)))

    if target_bed.thick_start == original_start and target_bed.thick_end == original_end:
        transcript.attributes["aligner_cds"] = True
        logger.debug("%s now has phases: %s", transcript.id, transcript.phases)
    else:
        transcript.attributes["aligner_cds"] = False
        transcript.strip_cds()
        if target_bed.coding is True:
            transcript.load_orfs([target_bed])
    logger.debug("%s now has phases: %s", transcript.id, transcript.phases)

    # Now we have to decide whether the transcript has the "original" CDS or not
    _, cigar = transfer.get_and_prepare_cigar(str(ref_cdna), str(target_cdna))
    ref_array, target_array = transfer.create_translation_array(cigar)
    # A failed ``.index`` lookup raises ValueError, not IndexError.
    # NOTE(review): presumably the fallback is also meant for positions that are
    # absent from the translation array - confirm.
    try:
        target_start = target_array[ref_array.index(ref_bed.thick_start)]
    except (ValueError, IndexError):
        target_start = target_bed.start
    try:
        target_end = target_array[ref_array.index(ref_bed.thick_end)]
    except (ValueError, IndexError):
        target_end = target_bed.end

    same_start = target_start == target_bed.thick_start
    same_end = target_end == target_bed.thick_end
    transcript.attributes["original_cds"] = bool(same_start and same_end)

    if ref_cdna == target_cdna:
        # Identical sequences: the CDS must have been transferred unchanged.
        logger.debug("%s now has phases: %s", transcript.id, transcript.phases)
        if transcript.is_coding is False:
            raise AssertionError("{} not coding".format(transcript.id))
        elif transcript.attributes["original_cds"] is False:
            raise AssertionError("\n".join([
                str(_) for _ in [
                    transcript.id,
                    (target_bed.thick_start, target_start,
                     target_bed.thick_start == target_start),
                    (target_bed.thick_end, target_end,
                     target_bed.thick_end == target_end),
                    target_bed.thick_start == target_start
                    and target_bed.thick_end == target_end
                ]
            ]))

    return transcript, target_bed, pep_coords
def create_transcript(tid: str, parent: str, lines: List[GtfLine],
                      args: argparse.Namespace):
    """Yield the transcript(s) that can be built from the GTF lines of one ID.

    Lines on different chromosomes are processed separately, with the
    chromosome name appended to transcript and parent IDs. On a single
    chromosome, exons that overlap (or, when ``args.split`` is False, are
    separated by at most ``args.min_intron``) are merged. The transcript is
    split when the gap exceeds ``args.max_intron`` or, with ``args.split``
    True, is at most ``args.min_intron``; split products get the suffixes
    ``.A``, ``.B``, ... Merging modifies the ``end`` of the input lines in place.

    :param tid: transcript ID.
    :param parent: parent (gene) ID.
    :param lines: GTF lines belonging to the transcript.
    :param args: namespace providing ``min_intron``, ``max_intron``, ``split``.
    :return: generator of finalized ``Transcript`` objects; nothing is yielded
        when the lines contain no exon.
    """
    chroms = defaultdict(list)
    for line in lines:
        chroms[line.chrom].append(line)

    if len(chroms) > 1:
        # Recursively
        for chrom, chrom_lines in chroms.items():
            newtid = tid + "." + chrom
            newparent = parent + "." + chrom
            for transcript in create_transcript(newtid, newparent, chrom_lines, args):
                assert transcript.id == newtid, (newtid, transcript.id)
                assert transcript.parent[0] == newparent
                yield transcript
        return

    # Now we are sure that we only have one chromosome
    exons = sorted([line for line in lines if line.is_exon],
                   key=operator.attrgetter("chrom", "start", "end"))
    if not exons:
        # Nothing to build from; previously this ended in an IndexError.
        return

    if len(exons) == 1:
        transcript = Transcript(exons[0])
        transcript.id = tid
        transcript.parent = parent
        transcript.finalize()
        yield transcript
        return

    first_letter = ord("A") - 1
    identifier = first_letter
    new_exons = deque()
    current = exons[0]
    for exon in exons[1:]:
        gap = exon.start - current.end + 1
        overlapping = overlap((exon.start, exon.end), (current.start, current.end)) > 0
        if overlapping or (gap <= args.min_intron and args.split is False):
            # Merge the two exons. Exons are sorted by start, so an exon that is
            # contained in the current one may end earlier: never shrink the
            # merged exon.
            current.end = max(current.end, exon.end)
        elif (gap <= args.min_intron and args.split is True) or gap > args.max_intron:
            # Split: close the transcript collected so far and start a new one.
            new_exons.append(current)
            transcript = Transcript(new_exons.popleft())
            transcript.add_exons(new_exons)
            transcript.finalize()
            identifier += 1
            transcript.parent = parent + "." + chr(identifier)
            transcript.id = tid + "." + chr(identifier)
            yield transcript
            current = exon
            new_exons = deque()
        else:
            new_exons.append(current)
            current = exon

    new_exons.append(current)
    transcript = Transcript(new_exons.popleft())
    transcript.add_exons(new_exons)
    if identifier == first_letter:
        # No split happened: keep the original identifiers.
        transcript.id = tid
        transcript.parent = parent
    else:
        identifier += 1
        transcript.id = tid + "." + chr(identifier)
        transcript.parent = parent + "." + chr(identifier)
    transcript.finalize()
    yield transcript