Пример #1
0
    def test_variant(self):
        """Check mature-sequence extraction and realignment from variants.

        First verifies ``get_mature_sequence`` on a synthetic sequence and
        on hsa-let-7a-5p taken from the example hairpin/GFF3 files, then
        runs ``align_from_variants`` on eight reads against that mature.
        Every read except test 7 must produce a falsy result; test 7 must
        produce a non-empty result whose first entry starts with 10.

        Raises:
            ValueError: when any of the expectations above is not met.
        """
        from mirtop.mirna import fasta, mapper
        from mirtop.mirna.realign import get_mature_sequence, \
            align_from_variants
        precursors = fasta.read_precursor("data/examples/annotate/hairpin.fa",
                                          "hsa")
        matures = mapper.read_gtf_to_precursor(
            "data/examples/annotate/hsa.gff3")
        res = get_mature_sequence("GAAAATTTTTTTTTTTAAAAG", [5, 15])
        if res != "AAAATTTTTTTTTTTAAAA":
            raise ValueError("Results for GAAAATTTTTTTTTTTAAAAG was %s" % res)
        mature = get_mature_sequence(precursors["hsa-let-7a-1"],
                                     matures["hsa-let-7a-1"]["hsa-let-7a-5p"])
        if mature != "GGGATGAGGTAGTAGGTTGTATAGTTTTAG":
            raise ValueError("Results for hsa-let-7a-5p is %s" % mature)

        # (test number, read, Variant string, expected res[0][0]).
        # None in the last slot means the result has to be falsy.
        cases = [
            (1, "AGGTAGTAGGTTGTATAGTT", "iso_5p:-2", None),
            (2, "GATGAGGTAGTAGGTTGTATAGTT", "iso_5p:+2", None),
            (3, "AGGTAGTAGGTTGTATAGTTTT", "iso_5p:-2,iso_add:2", None),
            (4, "AGGTAGTAGGTTGTATAGTTTT", "iso_5p:-2,iso_3p:2", None),
            (5, "AGGTAGTAGGTTGTATAG", "iso_5p:-2,iso_3p:-2", None),
            (6, "AGGTAGTAGGTTGTATAGAA",
             "iso_5p:-2,iso_3p:-2,iso_add:2", None),
            (7, "AGGTAGTAGGATGTATAGTT", "iso_5p:-2,iso_snp_central", 10),
            (8, "AGGTAGTAGGATGTATAGAA",
             "iso_5p:-2,iso_3p:-2,iso_add:2", None),
        ]
        for number, read, variants, first_position in cases:
            res = align_from_variants(read, mature, variants)
            if first_position is None:
                failed = bool(res)
            else:
                # The result must be non-empty before res[0][0] can be
                # compared; an empty result is itself a failure here.
                failed = not res or res[0][0] != first_position
            if failed:
                raise ValueError("Wrong alignment for test %s %s"
                                 % (number, res))
Пример #2
0
    def test_variant(self):
        """Check mature-sequence extraction and realignment from variants.

        First verifies ``get_mature_sequence`` on a synthetic sequence and
        on hsa-let-7a-5p taken from the example hairpin/GFF3 files, then
        runs ``align_from_variants`` on eight reads against that mature.
        Every read except test 7 must produce a falsy result; test 7 must
        produce a non-empty result whose first entry starts with 10.

        Raises:
            ValueError: when any of the expectations above is not met.
        """
        from mirtop.mirna import fasta, mapper
        from mirtop.mirna.realign import get_mature_sequence, \
            align_from_variants
        precursors = fasta.read_precursor("data/examples/annotate/hairpin.fa",
                                          "hsa")
        matures = mapper.read_gtf_to_precursor(
            "data/examples/annotate/hsa.gff3")
        res = get_mature_sequence("GAAAATTTTTTTTTTTAAAAG", [5, 15])
        if res != "AAAATTTTTTTTTTTAAAA":
            raise ValueError("Results for GAAAATTTTTTTTTTTAAAAG was %s" % res)
        mature = get_mature_sequence(precursors["hsa-let-7a-1"],
                                     matures["hsa-let-7a-1"]["hsa-let-7a-5p"])
        if mature != "GGGATGAGGTAGTAGGTTGTATAGTTTTAG":
            raise ValueError("Results for hsa-let-7a-5p is %s" % mature)

        # (test number, read, Variant string, expected res[0][0]).
        # None in the last slot means the result has to be falsy.
        cases = [
            (1, "AGGTAGTAGGTTGTATAGTT", "iso_5p:-2", None),
            (2, "GATGAGGTAGTAGGTTGTATAGTT", "iso_5p:+2", None),
            (3, "AGGTAGTAGGTTGTATAGTTTT", "iso_5p:-2,iso_add:2", None),
            (4, "AGGTAGTAGGTTGTATAGTTTT", "iso_5p:-2,iso_3p:2", None),
            (5, "AGGTAGTAGGTTGTATAG", "iso_5p:-2,iso_3p:-2", None),
            (6, "AGGTAGTAGGTTGTATAGAA",
             "iso_5p:-2,iso_3p:-2,iso_add:2", None),
            (7, "AGGTAGTAGGATGTATAGTT", "iso_5p:-2,iso_snp_central", 10),
            (8, "AGGTAGTAGGATGTATAGAA",
             "iso_5p:-2,iso_3p:-2,iso_add:2", None),
        ]
        for number, read, variants, first_position in cases:
            res = align_from_variants(read, mature, variants)
            if first_position is None:
                failed = bool(res)
            else:
                # The result must be non-empty before res[0][0] can be
                # compared; an empty result is itself a failure here.
                failed = not res or res[0][0] != first_position
            if failed:
                raise ValueError("Wrong alignment for test %s %s"
                                 % (number, res))
Пример #3
0
def variant_with_nt(line, precursors, matures):
    """
    Expand the Variant attribute of one GFF line into explicit changes.

    The line is parsed with read_gff_line; its Parent and Name attributes
    select the entries of `precursors` and `matures` that are handed to
    the variant helpers together with the Variant attribute.

    Returns the value "Invalid" unchanged when align_from_variants
    produces it; otherwise a string of the form
    "iso_5p:<t5>,iso_3p:<t3>,iso_add:<add>,iso_snp:<mm>", where <mm> is
    "0" if the alignment result is empty and the concatenation of all
    its items otherwise.
    """
    attr = read_gff_line(line)["attrb"]
    read = read_id(attr["UID"])

    parent = attr["Parent"]
    hairpin = precursors[parent]
    logger.debug("GFF::BODY::precursors %s" % hairpin)
    mature_pos = matures[parent][attr["Name"]]
    logger.debug("GFF:BODY::mature %s" % mature_pos)

    variant = attr["Variant"]
    t5 = variant_to_5p(hairpin, mature_pos, variant)
    t3 = variant_to_3p(hairpin, mature_pos, variant)
    add = variant_to_add(read, variant)

    mature_sequence = get_mature_sequence(hairpin, mature_pos)
    logger.debug("GFF::BODY::mature_sequence %s" % mature_sequence)

    mm = align_from_variants(read, mature_sequence, variant)
    if mm == "Invalid":
        return mm

    # Flatten every change into one string; "0" marks "no changes".
    if len(mm) > 0:
        snp = "".join(str(item) for change in mm for item in change)
    else:
        snp = "0"
    return "iso_5p:%s,iso_3p:%s,iso_add:%s,iso_snp:%s" % (t5, t3, add, snp)
Пример #4
0
def _align_to_mature(seq, hairpin, mature):
    """Return the start position obtained by aligning seq to the mature.

    The reference comes from get_mature_sequence(hairpin, mature).  The
    start is the number of "-" characters in the first 8 positions of the
    first element returned by align(), minus 4, plus int(mature[0]).
    """
    reference = get_mature_sequence(hairpin, mature)
    aligned = align(seq, reference)[0]
    # NOTE(review): the 8/4 constants presumably mirror the flank that
    # get_mature_sequence adds around the mature - confirm.
    leading_gaps = aligned[:8].count("-")
    mature_start = mature[0]
    start = leading_gaps - 4 + int(mature_start)
    logger.debug("PROST::align:sequence to mature %s" % aligned)
    logger.debug("PROST::align:start: %s -> %s" % (mature_start, start))
    return start
Пример #5
0
def _read_file(fn, precursors, matures, out_dir):
    """Write one tab-separated ``<sample>.mirna`` file per sample of `fn`.

    Every sample returned by read_samples(fn) first gets a file in
    `out_dir` holding only the header row.  Each non-comment line of `fn`
    is then parsed with feature(), decoded through the variant helpers,
    and appended as one row to every sample file, with the "freq" column
    taken from the matching entry of the comma-separated Expression
    attribute.  Lines whose alignment result has more than one element
    are skipped.
    """
    def sample_path(sample):
        return os.path.join(out_dir, "%s.mirna" % sample)

    header = "\t".join([
        "seq", "name", "freq", "mir", "start", "end", "mism", "add",
        "t5", "t3", "s5", "s3", "DB", "precursor", "ambiguity"
    ])
    samples = read_samples(fn)
    for sample in samples:
        # Mode 'w' truncates anything left over from a previous run.
        with open(sample_path(sample), 'w') as outh:
            print(header, file=outh)

    with open(fn) as inh:
        for record in inh:
            if record.startswith("#"):
                continue
            gff = feature(record)
            cols = gff.columns
            attr = gff.attributes
            read = read_id(attr["UID"])

            parent = attr["Parent"]
            hairpin = precursors[parent]
            mature_pos = matures[parent][attr["Name"]]
            variant = attr["Variant"]
            t5 = variant_to_5p(hairpin, mature_pos, variant)
            t3 = variant_to_3p(hairpin, mature_pos, variant)
            add = variant_to_add(read, variant)
            mature_sequence = get_mature_sequence(hairpin, mature_pos)

            mm = align_from_variants(read, mature_sequence, variant)
            n_changes = len(mm)
            if n_changes > 1:
                continue
            if n_changes == 1:
                mm = "".join(str(item) for item in mm[0])
            else:
                mm = "0"

            if "Hits" in attr:
                hit = attr["Hits"]
            else:
                hit = "1"
            logger.debug("exporter::isomir::decode %s" %
                         [variant, t5, t3, add, mm])
            # Error if attr["Read"] doesn't exist
            row = [
                read, attr["Read"], "0", attr["Name"], cols['source'],
                cols['type'], mm, add, t5, t3, "NA", "NA", "miRNA",
                parent, hit
            ]
            expression = attr["Expression"].split(",")
            for sample, counts in zip(samples, expression):
                with open(sample_path(sample), 'a') as outh:
                    # Slot 2 is the "freq" column of the header.
                    row[2] = counts
                    print("\t".join(row), file=outh)
Пример #6
0
def _read_file(fn, precursors, matures, out_dir):
    """Write one tab-separated ``<sample>.mirna`` file per sample of `fn`.

    Every sample returned by read_samples(fn) first gets a file in
    `out_dir` holding only the header row.  Each non-comment line of `fn`
    is split on tabs, its attributes are read with read_attributes(), and
    the decoded values are appended as one row to every sample file, with
    the "freq" column taken from the matching entry of the
    comma-separated Expression attribute.  Lines whose alignment result
    has more than one element are skipped.
    """
    header = "\t".join(
        ["seq", "name", "freq", "mir", "start", "end",
         "mism", "add", "t5", "t3", "s5", "s3", "DB",
         "precursor", "ambiguity"])
    samples = read_samples(fn)
    for sample in samples:
        # Mode 'w' truncates anything left over from a previous run.
        target = os.path.join(out_dir, "%s.mirna" % sample)
        with open(target, 'w') as outh:
            print(header, file=outh)

    with open(fn) as inh:
        for record in inh:
            if record.startswith("#"):
                continue
            cols = record.strip().split("\t")
            attr = read_attributes(record)
            read = read_id(attr["UID"])

            parent = attr["Parent"]
            hairpin = precursors[parent]
            mature_pos = matures[parent][attr["Name"]]
            variant = attr["Variant"]
            t5 = variant_to_5p(hairpin, mature_pos, variant)
            t3 = variant_to_3p(hairpin, mature_pos, variant)
            add = variant_to_add(read, variant)
            mature_sequence = get_mature_sequence(hairpin, mature_pos)

            mm = align_from_variants(read, mature_sequence, variant)
            n_changes = len(mm)
            if n_changes > 1:
                continue
            if n_changes == 1:
                mm = "".join(str(item) for item in mm[0])
            else:
                mm = "0"

            if "Hits" in attr:
                hit = attr["Hits"]
            else:
                hit = "1"
            logger.debug("exporter::isomir::decode %s" % [variant,
                                                          t5, t3, add, mm])
            # Error if attr["Read"] doesn't exist
            row = [read, attr["Read"], "0", attr["Name"], cols[1], cols[2],
                   mm, add, t5, t3, "NA", "NA", "miRNA", parent, hit]
            expression = attr["Expression"].split(",")
            for sample, counts in zip(samples, expression):
                target = os.path.join(out_dir, "%s.mirna" % sample)
                with open(target, 'a') as outh:
                    # Slot 2 is the "freq" column of the header.
                    row[2] = counts
                    print("\t".join(row), file=outh)
Пример #7
0
def variant_with_nt(line, precursors, matures):
    """
    Expand the Variant attribute of one GFF line into explicit changes.

    The line is parsed with feature().  Only the first comma-separated
    entry of the Parent attribute is used, and it is written back into
    the attribute mapping.  An empty string is returned, after a warning,
    when that parent is not a key of `matures` or when Name is not listed
    under it.

    Returns the value "Invalid" unchanged when align_from_variants
    produces it; otherwise a string of the form
    "iso_5p:<t5>,iso_3p:<t3>,iso_add3p:<add>,iso_snv:<mm>", where <mm> is
    "0" if the alignment result is empty and the concatenation of all
    its items otherwise.
    """
    attr = feature(line).attributes
    read = read_id(attr["UID"])

    parent = attr["Parent"].split(",")[0]
    attr["Parent"] = parent
    if parent not in matures:
        logger.warning("Parent miRNA not found in database %s" % parent)
        return ""
    name = attr["Name"]
    known_matures = matures[parent]
    if name not in known_matures:
        logger.warning("miRNA not found in database %s" % name)
        return ""

    hairpin = precursors[parent]
    logger.debug("GFF::BODY::precursors %s" % hairpin)
    mature_pos = known_matures[name]
    logger.debug("GFF:BODY::mature %s" % mature_pos)

    variant = attr["Variant"]
    t5 = variant_to_5p(hairpin, mature_pos, variant)
    t3 = variant_to_3p(hairpin, mature_pos, variant)
    add = variant_to_add(read, variant)

    mature_sequence = get_mature_sequence(hairpin, mature_pos, nt=8)
    logger.debug("GFF::BODY::mature_sequence %s" % mature_sequence)

    mm = align_from_variants(read, mature_sequence, variant)
    if mm == "Invalid":
        return mm

    # Flatten every change into one string; "0" marks "no changes".
    if len(mm) > 0:
        snv = "".join(str(item) for change in mm for item in change)
    else:
        snv = "0"
    return "iso_5p:%s,iso_3p:%s,iso_add3p:%s,iso_snv:%s" % (t5, t3, add, snv)
Пример #8
0
    def test_spikeins(self):
        """Load the example spike-ins, check one mature, and re-read output.

        Reads data/examples/spikeins/spikeins.fa, verifies that the mature
        stored for 'spikein-1' equals the one get_mature_sequence() derives
        from its precursor and position with exact=True, then passes the
        loaded records to write_precursors()/write_gff() and reads the two
        resulting paths back through the mapper and fasta readers.

        Raises:
            ValueError: when the two mature sequences differ.
        """
        from mirtop.libs import spikeins
        from mirtop.mirna.realign import get_mature_sequence

        load = spikeins.read_spikeins("data/examples/spikeins/spikeins.fa")
        print(load)

        spike = load['spikein-1']
        mature_from_data = get_mature_sequence(
            spike['precursor'], spike['position'], exact=True)
        expected = spike['mature']
        if mature_from_data != expected:
            message = "Sequences doesn't match \n%s\n%s" % (
                mature_from_data, expected)
            raise ValueError(message)

        file_fasta = "data/examples/spikeins/spikeins_pre.fasta"
        file_gff = "data/examples/spikeins/spikeins_pre.gff"
        spikeins.write_precursors(load, file_fasta)
        spikeins.write_gff(load, file_gff)

        # Imported only now, exactly where the readers are first needed.
        from mirtop.mirna import mapper, fasta
        map_mir = mapper.read_gtf_to_mirna(file_gff)
        print(map_mir)
        fasta_precursor = fasta.read_precursor(file_fasta, None)
        print(fasta_precursor)