def test_variant(self):
    """Test mature sequence extraction and realignment from variants.

    Loads the example hairpin FASTA and GFF3 annotation, checks
    ``get_mature_sequence`` on a synthetic sequence and on
    hsa-let-7a-5p, and then calls ``align_from_variants`` with several
    reads and Variant strings.  Tests 1-6 and 8 expect an empty (falsy)
    result; test 7 expects a non-empty result whose first entry starts
    with 10.

    Raises:
        ValueError: if any of the calls returns an unexpected value.
    """
    from mirtop.mirna import fasta, mapper
    from mirtop.mirna.realign import get_mature_sequence, \
        align_from_variants
    precursors = fasta.read_precursor("data/examples/annotate/hairpin.fa",
                                      "hsa")
    matures = mapper.read_gtf_to_precursor(
        "data/examples/annotate/hsa.gff3")

    # Synthetic sequence with explicit coordinates.
    res = get_mature_sequence("GAAAATTTTTTTTTTTAAAAG", [5, 15])
    if res != "AAAATTTTTTTTTTTAAAA":
        raise ValueError("Results for GAAAATTTTTTTTTTTAAAAG was %s" % res)

    # Sequence taken from the example annotation files; it is the
    # reference for every alignment below.
    mature = get_mature_sequence(precursors["hsa-let-7a-1"],
                                 matures["hsa-let-7a-1"]["hsa-let-7a-5p"])
    if mature != "GGGATGAGGTAGTAGGTTGTATAGTTTTAG":
        raise ValueError("Results for hsa-let-7a-5p is %s" % mature)

    res = align_from_variants("AGGTAGTAGGTTGTATAGTT", mature,
                              "iso_5p:-2")
    if res:
        raise ValueError("Wrong alignment for test 1 %s" % res)

    res = align_from_variants("GATGAGGTAGTAGGTTGTATAGTT", mature,
                              "iso_5p:+2")
    if res:
        raise ValueError("Wrong alignment for test 2 %s" % res)

    res = align_from_variants("AGGTAGTAGGTTGTATAGTTTT", mature,
                              "iso_5p:-2,iso_add:2")
    if res:
        raise ValueError("Wrong alignment for test 3 %s" % res)

    res = align_from_variants("AGGTAGTAGGTTGTATAGTTTT", mature,
                              "iso_5p:-2,iso_3p:2")
    if res:
        raise ValueError("Wrong alignment for test 4 %s" % res)

    res = align_from_variants("AGGTAGTAGGTTGTATAG", mature,
                              "iso_5p:-2,iso_3p:-2")
    if res:
        raise ValueError("Wrong alignment for test 5 %s" % res)

    res = align_from_variants("AGGTAGTAGGTTGTATAGAA", mature,
                              "iso_5p:-2,iso_3p:-2,iso_add:2")
    if res:
        raise ValueError("Wrong alignment for test 6 %s" % res)

    res = align_from_variants("AGGTAGTAGGATGTATAGTT", mature,
                              "iso_5p:-2,iso_snp_central")
    # An empty result is a failure too.  The previous nested check
    # (``if not res:`` followed by ``res[0][0] != 10``) indexed an empty
    # result and never validated a non-empty one.
    if not res or res[0][0] != 10:
        raise ValueError("Wrong alignment for test 7 %s" % res)

    res = align_from_variants("AGGTAGTAGGATGTATAGAA", mature,
                              "iso_5p:-2,iso_3p:-2,iso_add:2")
    if res:
        raise ValueError("Wrong alignment for test 8 %s" % res)
def variant_with_nt(line, precursors, matures):
    """
    Describe the nucleotide changes of the isomiR stored in a GFF line.

    The Variant attribute is combined with the precursor sequence and
    the mature position of the annotated miRNA.  Returns "Invalid" when
    align_from_variants does, otherwise a string of the form
    iso_5p:..,iso_3p:..,iso_add:..,iso_snp:.. where iso_snp is "0" when
    no mismatch was reported.
    """
    attr = read_gff_line(line)["attrb"]
    read = read_id(attr["UID"])

    hairpin = precursors[attr["Parent"]]
    logger.debug("GFF::BODY::precursors %s" % hairpin)
    position = matures[attr["Parent"]][attr["Name"]]
    logger.debug("GFF:BODY::mature %s" % position)

    variant = attr["Variant"]
    t5 = variant_to_5p(hairpin, position, variant)
    t3 = variant_to_3p(hairpin, position, variant)
    add = variant_to_add(read, variant)

    mature_sequence = get_mature_sequence(hairpin, position)
    logger.debug("GFF::BODY::mature_sequence %s" % mature_sequence)

    mm = align_from_variants(read, mature_sequence, variant)
    if mm == "Invalid":
        return mm
    if len(mm) > 0:
        # Concatenate every field of every mismatch into a single token.
        snp = "".join(str(field) for mismatch in mm for field in mismatch)
    else:
        snp = "0"
    return "iso_5p:%s,iso_3p:%s,iso_add:%s,iso_snp:%s" % (t5, t3, add, snp)
def _align_to_mature(seq, hairpin, mature):
    """Align seq to the mature sequence and return the derived start."""
    reference = get_mature_sequence(hairpin, mature)
    aligned = align(seq, reference)[0]
    # Gaps within the first 8 alignment columns shift the start.
    # NOTE(review): the constant 4 presumably matches the flanking
    # nucleotides added by get_mature_sequence - confirm.
    leading_gaps = aligned[:8].count("-")
    start = leading_gaps - 4 + int(mature[0])
    logger.debug("PROST::align:sequence to mature %s" % aligned)
    logger.debug("PROST::align:start: %s -> %s" % (mature[0], start))
    return start
def _read_file(fn, precursors, matures, out_dir):
    """Write one tab-separated ``<sample>.mirna`` file per sample of fn.

    Each output file gets a header row and then one row per
    non-comment line of fn for which align_from_variants reports at
    most one entry.  The "freq" column holds the sample's own value
    from the comma-separated Expression attribute.
    """
    header = "\t".join([
        "seq", "name", "freq", "mir", "start", "end", "mism", "add",
        "t5", "t3", "s5", "s3", "DB", "precursor", "ambiguity"
    ])

    def _sample_path(sample):
        return os.path.join(out_dir, "%s.mirna" % sample)

    samples = read_samples(fn)
    # Create (or truncate) every output file with just the header.
    for sample in samples:
        with open(_sample_path(sample), 'w') as outh:
            print(header, file=outh)

    with open(fn) as inh:
        for record in inh:
            if record.startswith("#"):
                continue
            gff = feature(record)
            cols, attr = gff.columns, gff.attributes
            read = read_id(attr["UID"])

            hairpin = precursors[attr["Parent"]]
            position = matures[attr["Parent"]][attr["Name"]]
            variant = attr["Variant"]
            t5 = variant_to_5p(hairpin, position, variant)
            t3 = variant_to_3p(hairpin, position, variant)
            add = variant_to_add(read, variant)
            mature_sequence = get_mature_sequence(hairpin, position)

            mm = align_from_variants(read, mature_sequence, variant)
            n_changes = len(mm)
            if n_changes > 1:
                # More than one entry cannot be written to "mism".
                continue
            if n_changes == 1:
                mm = "".join(str(field) for field in mm[0])
            else:
                mm = "0"

            if "Hits" in attr:
                hit = attr["Hits"]
            else:
                hit = "1"
            logger.debug("exporter::isomir::decode %s" % [attr["Variant"],
                                                          t5, t3, add, mm])
            # attr["Read"] is required; a missing entry raises here.
            # NOTE(review): the "start"/"end" columns are filled from
            # cols['source'] and cols['type'] - confirm this is intended.
            fields = [
                read, attr["Read"], "0", attr["Name"], cols['source'],
                cols['type'], mm, add, t5, t3, "NA", "NA", "miRNA",
                attr["Parent"], hit
            ]
            # Same row for every sample; only the "freq" field changes.
            expression = attr["Expression"].split(",")
            for sample, counts in zip(samples, expression):
                fields[2] = counts
                with open(_sample_path(sample), 'a') as outh:
                    print("\t".join(fields), file=outh)
def _read_file(fn, precursors, matures, out_dir):
    """Export the records of fn as one ``<sample>.mirna`` table per sample.

    A header row is written to every output file first.  Then each
    non-comment line of fn for which align_from_variants reports at
    most one entry is appended to every sample's file, with that
    sample's value from the Expression attribute in the "freq" column.
    """
    column_names = ["seq", "name", "freq", "mir", "start", "end", "mism",
                    "add", "t5", "t3", "s5", "s3", "DB", "precursor",
                    "ambiguity"]
    samples = read_samples(fn)
    for sample in samples:
        out_fn = os.path.join(out_dir, "%s.mirna" % sample)
        with open(out_fn, 'w') as outh:
            print("\t".join(column_names), file=outh)

    with open(fn) as inh:
        for raw in inh:
            if raw.startswith("#"):
                continue
            cols = raw.strip().split("\t")
            attr = read_attributes(raw)
            read = read_id(attr["UID"])

            hairpin = precursors[attr["Parent"]]
            position = matures[attr["Parent"]][attr["Name"]]
            variant = attr["Variant"]
            t5 = variant_to_5p(hairpin, position, variant)
            t3 = variant_to_3p(hairpin, position, variant)
            add = variant_to_add(read, variant)
            mature_sequence = get_mature_sequence(hairpin, position)

            mm = align_from_variants(read, mature_sequence, variant)
            if len(mm) > 1:
                # Only zero or one entry fits in the "mism" column.
                continue
            if len(mm) == 1:
                mm = "".join(str(field) for field in mm[0])
            else:
                mm = "0"

            if "Hits" in attr:
                hit = attr["Hits"]
            else:
                hit = "1"
            logger.debug("exporter::isomir::decode %s" % [attr["Variant"],
                                                          t5, t3, add, mm])
            # attr["Read"] is required; a missing entry raises here.
            # NOTE(review): the "start"/"end" columns are filled from
            # cols[1] and cols[2] - confirm this is intended.
            row = [read, attr["Read"], "0", attr["Name"], cols[1], cols[2],
                   mm, add, t5, t3, "NA", "NA", "miRNA", attr["Parent"],
                   hit]
            per_sample = zip(samples, attr["Expression"].split(","))
            for sample, counts in per_sample:
                out_fn = os.path.join(out_dir, "%s.mirna" % sample)
                with open(out_fn, 'a') as outh:
                    row[2] = counts
                    print("\t".join(row), file=outh)
def variant_with_nt(line, precursors, matures):
    """
    Describe the nucleotide changes of the isomiR stored in a GFF line.

    Uses the Variant attribute, the precursor sequence and the mature
    position.  Returns "" (after a warning) when the parent or the miRNA
    is not in matures, "Invalid" when align_from_variants does, and
    otherwise a string of the form
    iso_5p:..,iso_3p:..,iso_add3p:..,iso_snv:.. where iso_snv is "0"
    when no mismatch was reported.
    """
    attr = feature(line).attributes
    read = read_id(attr["UID"])

    # Keep only the first parent when several are listed.
    attr["Parent"] = attr["Parent"].split(",")[0]
    parent = attr["Parent"]
    if parent not in matures:
        logger.warning("Parent miRNA not found in database %s" % parent)
        return ""
    name = attr["Name"]
    if name not in matures[parent]:
        logger.warning("miRNA not found in database %s" % name)
        return ""

    hairpin = precursors[parent]
    logger.debug("GFF::BODY::precursors %s" % hairpin)
    position = matures[parent][name]
    logger.debug("GFF:BODY::mature %s" % position)

    variant = attr["Variant"]
    t5 = variant_to_5p(hairpin, position, variant)
    t3 = variant_to_3p(hairpin, position, variant)
    add = variant_to_add(read, variant)

    mature_sequence = get_mature_sequence(hairpin, position, nt=8)
    logger.debug("GFF::BODY::mature_sequence %s" % mature_sequence)

    mm = align_from_variants(read, mature_sequence, variant)
    if mm == "Invalid":
        return mm
    if len(mm) > 0:
        # Concatenate every field of every mismatch into a single token.
        pieces = []
        for mismatch in mm:
            pieces.extend(str(field) for field in mismatch)
        snv = "".join(pieces)
    else:
        snv = "0"
    return "iso_5p:%s,iso_3p:%s,iso_add3p:%s,iso_snv:%s" % (t5, t3, add, snv)
def test_spikeins(self):
    """Test loading spike-ins and exporting them as FASTA and GFF.

    Reads the example spike-in FASTA, checks that the mature sequence
    recomputed from the precursor and position of 'spikein-1' equals the
    stored one, writes the precursor FASTA and GFF files and reads both
    back.

    Raises:
        ValueError: if the recomputed mature sequence differs.
    """
    from mirtop.libs import spikeins
    from mirtop.mirna.realign import get_mature_sequence
    spikes = spikeins.read_spikeins("data/examples/spikeins/spikeins.fa")
    print(spikes)

    first = spikes['spikein-1']
    observed = get_mature_sequence(first['precursor'],
                                   first['position'],
                                   exact=True)
    expected = first['mature']
    if observed != expected:
        raise ValueError("Sequences doesn't match \n%s\n%s" % (observed,
                                                               expected))

    # Write the loaded spike-ins out, then parse the files back.
    file_fasta = "data/examples/spikeins/spikeins_pre.fasta"
    file_gff = "data/examples/spikeins/spikeins_pre.gff"
    spikeins.write_precursors(spikes, file_fasta)
    spikeins.write_gff(spikes, file_gff)

    from mirtop.mirna import mapper, fasta
    parsed_gff = mapper.read_gtf_to_mirna(file_gff)
    print(parsed_gff)
    parsed_fasta = fasta.read_precursor(file_fasta, None)
    print(parsed_fasta)