예제 #1
0
def _convert_file(gff, args):
    """Convert one GFF file into a tab-separated ``<name>_rawData.tsv`` file.

    The output is written into ``args.out`` and named after the input file.
    The header row is built from the ``## COLDATA:`` line; every following
    line that passes the checks below becomes one row made of the value
    returned by ``read_id(UID)``, the miRNA name, the expanded output of
    ``variant_with_nt`` and the per-sample expression values.

    Lines are skipped (and counted, except for the last case) when the UID
    cannot be decoded, when the parent is missing from the hairpin file,
    when the miRNA is not listed under its parent, or when
    ``variant_with_nt`` returns ``"Invalid"``.

    Args:
        gff: path to the input GFF file.
        args: options object; ``hairpin``, ``sps``, ``gtf`` and ``out`` are
            read here.
    """
    sep = "\t"
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    matures = mapper.read_gtf_to_precursor(args.gtf)
    variant_header = sep.join(['mism', 'add', 't5', 't3'])

    out_name = "%s_rawData.tsv" % os.path.splitext(os.path.basename(gff))[0]
    out_file = os.path.join(args.out, out_name)
    missing_parent = 0
    missing_mirna = 0
    unvalid_uid = 0
    # Both handles are managed by the same ``with`` so that neither leaks
    # if parsing fails half way through the file.
    with open(gff, 'r') as gff_file, open(out_file, 'w') as outh:

        # Consume the header up to (and including) the COLDATA line.
        for samples_line in gff_file:
            if samples_line.startswith("## COLDATA:"):
                samples = sep.join(samples_line.strip().split("COLDATA:")[1].strip().split(","))
                header_line = sep.join(['seq', 'mir',
                                        variant_header, samples])
                print(header_line, file=outh)
                break

        # The iterator resumes right after the COLDATA line.
        for mirna_line in gff_file:
            attr = feature(mirna_line).attributes
            UID = attr["UID"]
            mirna = attr["Name"]
            parent = attr["Parent"]
            variant = attr["Variant"]
            try:
                Read = read_id(UID)
            except KeyError:
                unvalid_uid += 1
                continue

            expression = sep.join(attr["Expression"].strip().split(","))
            logger.debug("COUNTS::Read:%s" % Read)
            logger.debug("COUNTS::EXTRA:%s" % variant)
            if parent not in precursors:
                missing_parent += 1
                continue
            if mirna not in matures[parent]:
                missing_mirna += 1
                continue
            extra = variant_with_nt(mirna_line, precursors, matures)
            if extra == "Invalid":
                continue
            logger.debug("COUNTS::EXTRA:%s" % extra)
            cols_variants = sep.join(_expand(extra, True))
            summary = sep.join([Read,  mirna,
                                cols_variants, expression])
            logger.debug(summary)
            print(summary, file=outh)

    logger.info("Missing Parents in hairpin file: %s" % missing_parent)
    logger.info("Missing MiRNAs in GFF file: %s" % missing_mirna)
    logger.info("Non valid UID: %s" % unvalid_uid)
    logger.info("Output file is at %s" % out_file)
예제 #2
0
파일: isomirs.py 프로젝트: smoe/mirtop
def convert(args):
    """Run ``read_file`` on every path listed in ``args.files``.

    The hairpin sequences and the precursor/mature mapping are loaded once
    and shared by all the files.
    """
    # The result is not used here; the call is kept so that the behaviour
    # of the function is unchanged.
    mapper.guess_database(args.gtf)
    hairpins = fasta.read_precursor(args.hairpin, args.sps)
    mature_map = mapper.read_gtf_to_precursor(args.gtf)
    for path in args.files:
        read_file(path, hairpins, mature_map)
예제 #3
0
def reader(args):
    """
    Realign BAM hits to miRBase to get better accuracy and annotation.

    Every file in ``args.files`` is parsed with the importer selected by
    ``args.format``, written to its own file inside ``args.out`` and then
    merged with the others into ``mirtop.<out_format>``. GFF inputs are
    only read and merged; no per-file output is written for them. When
    ``args.low_memory`` is set the work is delegated to ``read.reader``.

    Args:
        args: options object. ``database``, ``precursors`` and ``matures``
            are stored on it before the importers are called.

    Returns:
        None.

    Raises:
        ValueError: if ``args.format`` is not one of the handled formats.
    """
    if args.low_memory:
        read.reader(args)
        return None
    samples = []
    database = mapper.guess_database(args)
    args.database = database
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    args.precursors = precursors
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.matures = matures
    # TODO check numbers of miRNA and precursors read
    # TODO print message if numbers mismatch
    out_dts = dict()
    if args.keep_name and len(args.files) > 1:
        logger.warning("--keep-name when running multiple samples\n"
                       "can generate wrong results if the\n"
                       "name read is different across sample\n"
                       "for the same sequence.")
    for fn in args.files:
        fn = op.normpath(fn)
        if args.format == "gff":
            # Sample names come from the file itself and nothing is written
            # per file: the content only takes part in the final merge.
            samples.extend(header.read_samples(fn))
            out_dts[fn] = body.read(fn, args)
            continue
        sample = op.splitext(op.basename(fn))[0]
        samples.append(sample)
        fn_out = op.join(args.out, sample + ".%s" % args.out_format)
        if args.format == "BAM":
            reads = _read_bam(fn, args)
        elif args.format == "seqbuster":
            reads = seqbuster.read_file(fn, args)
        elif args.format == "srnabench":
            out_dts[fn] = srnabench.read_file(fn, args)
        elif args.format == "prost":
            reads = prost.read_file(fn, precursors, database, args.gtf)
        elif args.format == "isomirsea":
            out_dts[fn] = isomirsea.read_file(fn, args)
        elif args.format == "manatee":
            out_dts[fn] = manatee.read_file(fn, database, args)
        elif args.format == "optimir":
            out_dts[fn] = optimir.read_file(fn, args)
        else:
            # Without this branch an unknown format would fail later with an
            # UnboundLocalError on ``reads``.
            raise ValueError("%s format is not supported" % args.format)
        # These importers already return the final body; the others return
        # reads that still have to be annotated.
        if args.format not in ["isomirsea", "srnabench", "manatee", 'optimir']:
            ann = annotate(reads, matures, precursors)
            out_dts[fn] = body.create(ann, database, sample, args)
        h = header.create([sample], database, header.make_tools(args.format))
        _write(out_dts[fn], h, fn_out, args)
    # merge all reads for all samples into one dict
    # NOTE(review): this check looks unreachable because of the early return
    # above, unless one of the importers changes args.low_memory - confirm.
    if args.low_memory:
        return None
    merged = merge.merge(out_dts, samples)
    fn_merged_out = op.join(args.out, "mirtop.%s" % args.out_format)
    _write(merged,
           header.create(samples, database, header.make_tools([args.format])),
           fn_merged_out, args)
예제 #4
0
파일: isomirs.py 프로젝트: miRTop/mirtop
def convert(args):
    """
    Entry point of the GFF3 to isomiRs Bioc Package conversion.

    Loads the hairpin sequences and the precursor/mature mapping once and
    hands every input file to ``_read_file``.

    Args:
      *args*: supported options for this sub-command.
        See *mirtop.libs.parse.add_subparser_export()*.
    """
    hairpins = fasta.read_precursor(args.hairpin, args.sps)
    mature_map = mapper.read_gtf_to_precursor(args.gtf)
    for gff_path in args.files:
        logger.info("Reading %s" % gff_path)
        _read_file(gff_path, hairpins, mature_map, args.out)
예제 #5
0
 def test_read(self):
     """Check that the example GFF3 and hairpin FASTA files can be loaded."""
     from mirtop.mirna import mapper, fasta
     from mirtop.libs import logger
     logger.initialize_logger("test_read_files", True, True)
     map_mir = mapper.read_gtf_to_precursor(
         "data/examples/annotate/hsa.gff3")
     # Call form of print: valid on Python 3 and same output on Python 2.
     print(map_mir)
     if map_mir["hsa-let-7a-1"]["hsa-let-7a-5p"][0] != 5:
         raise ValueError("GFF is not loaded correctly.")
     # Only checks that the FASTA file is read without raising.
     fasta.read_precursor(
         "data/examples/annotate/hairpin.fa", "hsa")
     # read data/aligments/let7-perfect.bam
     return True
예제 #6
0
def convert(args):
    """
    Convert every GFF3 file in ``args.files`` for the isomiRs Bioc Package.

    The reference data (hairpins and precursor/mature mapping) is read once
    and reused for each file passed to ``_read_file``.

    Args:
      *args*: supported options for this sub-command.
        See *mirtop.libs.parse.add_subparser_export()*.
    """
    reference_hairpins = fasta.read_precursor(args.hairpin, args.sps)
    reference_matures = mapper.read_gtf_to_precursor(args.gtf)
    for input_file in args.files:
        logger.info("Reading %s" % input_file)
        _read_file(input_file, reference_hairpins, reference_matures,
                   args.out)
예제 #7
0
    def test_variant(self):
        """Test ``get_mature_sequence`` and ``align_from_variants``.

        Tests 1-6 and 8 expect ``align_from_variants`` to return a falsy
        value; test 7 expects a first entry whose position is 10.
        """
        from mirtop.mirna import fasta, mapper
        from mirtop.mirna.realign import get_mature_sequence, \
            align_from_variants
        precursors = fasta.read_precursor("data/examples/annotate/hairpin.fa",
                                          "hsa")
        matures = mapper.read_gtf_to_precursor(
            "data/examples/annotate/hsa.gff3")
        res = get_mature_sequence("GAAAATTTTTTTTTTTAAAAG", [5, 15])
        if res != "AAAATTTTTTTTTTTAAAA":
            raise ValueError("Results for GAAAATTTTTTTTTTTAAAAG was %s" % res)
        mature = get_mature_sequence(precursors["hsa-let-7a-1"],
                                     matures["hsa-let-7a-1"]["hsa-let-7a-5p"])
        if mature != "GGGATGAGGTAGTAGGTTGTATAGTTTTAG":
            raise ValueError("Results for hsa-let-7a-5p is %s" % mature)

        res = align_from_variants("AGGTAGTAGGTTGTATAGTT", mature,
                                  "iso_5p:-2")
        if res:
            raise ValueError("Wrong alignment for test 1 %s" % res)
        res = align_from_variants("GATGAGGTAGTAGGTTGTATAGTT", mature,
                                  "iso_5p:+2")
        if res:
            raise ValueError("Wrong alignment for test 2 %s" % res)
        res = align_from_variants("AGGTAGTAGGTTGTATAGTTTT", mature,
                                  "iso_5p:-2,iso_add:2")
        if res:
            raise ValueError("Wrong alignment for test 3 %s" % res)
        res = align_from_variants("AGGTAGTAGGTTGTATAGTTTT", mature,
                                  "iso_5p:-2,iso_3p:2")
        if res:
            raise ValueError("Wrong alignment for test 4 %s" % res)
        res = align_from_variants("AGGTAGTAGGTTGTATAG", mature,
                                  "iso_5p:-2,iso_3p:-2")
        if res:
            raise ValueError("Wrong alignment for test 5 %s" % res)
        res = align_from_variants("AGGTAGTAGGTTGTATAGAA", mature,
                                  "iso_5p:-2,iso_3p:-2,iso_add:2")
        if res:
            raise ValueError("Wrong alignment for test 6 %s" % res)
        res = align_from_variants("AGGTAGTAGGATGTATAGTT", mature,
                                  "iso_5p:-2,iso_snp_central")
        # The previous guard indexed ``res`` only when it was falsy, so it
        # either crashed or checked nothing. An empty result and a wrong
        # position are both failures here.
        if not res or res[0][0] != 10:
            raise ValueError("Wrong alignment for test 7 %s" % res)
        res = align_from_variants("AGGTAGTAGGATGTATAGAA", mature,
                                  "iso_5p:-2,iso_3p:-2,iso_add:2")
        if res:
            raise ValueError("Wrong alignment for test 8 %s" % res)
예제 #8
0
    def test_variant(self):
        """Exercise ``get_mature_sequence`` and ``align_from_variants``.

        All alignment cases except number 7 must give a falsy result;
        case 7 must report position 10 in its first entry.
        """
        from mirtop.mirna import fasta, mapper
        from mirtop.mirna.realign import get_mature_sequence, \
            align_from_variants
        precursors = fasta.read_precursor("data/examples/annotate/hairpin.fa",
                                          "hsa")
        matures = mapper.read_gtf_to_precursor(
            "data/examples/annotate/hsa.gff3")
        res = get_mature_sequence("GAAAATTTTTTTTTTTAAAAG", [5, 15])
        if res != "AAAATTTTTTTTTTTAAAA":
            raise ValueError("Results for GAAAATTTTTTTTTTTAAAAG was %s" % res)
        mature = get_mature_sequence(precursors["hsa-let-7a-1"],
                                     matures["hsa-let-7a-1"]["hsa-let-7a-5p"])
        if mature != "GGGATGAGGTAGTAGGTTGTATAGTTTTAG":
            raise ValueError("Results for hsa-let-7a-5p is %s" % mature)

        res = align_from_variants("AGGTAGTAGGTTGTATAGTT", mature, "iso_5p:-2")
        if res:
            raise ValueError("Wrong alignment for test 1 %s" % res)
        res = align_from_variants("GATGAGGTAGTAGGTTGTATAGTT", mature,
                                  "iso_5p:+2")
        if res:
            raise ValueError("Wrong alignment for test 2 %s" % res)
        res = align_from_variants("AGGTAGTAGGTTGTATAGTTTT", mature,
                                  "iso_5p:-2,iso_add:2")
        if res:
            raise ValueError("Wrong alignment for test 3 %s" % res)
        res = align_from_variants("AGGTAGTAGGTTGTATAGTTTT", mature,
                                  "iso_5p:-2,iso_3p:2")
        if res:
            raise ValueError("Wrong alignment for test 4 %s" % res)
        res = align_from_variants("AGGTAGTAGGTTGTATAG", mature,
                                  "iso_5p:-2,iso_3p:-2")
        if res:
            raise ValueError("Wrong alignment for test 5 %s" % res)
        res = align_from_variants("AGGTAGTAGGTTGTATAGAA", mature,
                                  "iso_5p:-2,iso_3p:-2,iso_add:2")
        if res:
            raise ValueError("Wrong alignment for test 6 %s" % res)
        res = align_from_variants("AGGTAGTAGGATGTATAGTT", mature,
                                  "iso_5p:-2,iso_snp_central")
        # Fail on a missing result as well as on a wrong position; the
        # nested form used before could only index a falsy ``res``.
        if not res or res[0][0] != 10:
            raise ValueError("Wrong alignment for test 7 %s" % res)
        res = align_from_variants("AGGTAGTAGGATGTATAGAA", mature,
                                  "iso_5p:-2,iso_3p:-2,iso_add:2")
        if res:
            raise ValueError("Wrong alignment for test 8 %s" % res)
예제 #9
0
 def test_read(self):
     """Load the example annotation and hairpin files and compare results.

     The hairpin file is read twice, once with species "hsa" and once with
     ``None``; both calls must give equal results.
     """
     from mirtop.mirna import mapper, fasta
     from mirtop.libs import logger
     logger.initialize_logger("test_read_files", True, True)

     annotation = mapper.read_gtf_to_precursor(
         "data/examples/annotate/hsa.gff3")
     print(annotation)
     first_position = annotation["hsa-let-7a-1"]["hsa-let-7a-5p"][0]
     if first_position != 5:
         raise ValueError("GFF is not loaded correctly.")

     with_species = fasta.read_precursor(
         "data/examples/annotate/hairpin.fa", "hsa")
     print(with_species)
     without_species = fasta.read_precursor(
         "data/examples/annotate/hairpin.fa", None)
     print(without_species)
     if with_species != without_species:
         raise ValueError("species value generates two different dicts.")
     # read data/aligments/let7-perfect.bam
     return True
예제 #10
0
파일: __init__.py 프로젝트: chapmanb/mirtop
def reader(args):
    """
    Realign BAM hits to miRBase to get better accuracy and annotation.

    Every file in ``args.files`` is parsed with the importer selected by
    ``args.format``, written to its own file inside ``args.out`` and then
    merged with the others into ``mirtop.<out_format>``. GFF inputs are
    only read and merged; no per-file output is written for them.

    Args:
        args: options object. ``database``, ``precursors`` and ``matures``
            are stored on it before the importers are called.

    Raises:
        ValueError: if ``args.format`` is not one of the handled formats.
    """
    samples = []
    database = mapper.guess_database(args.gtf)
    args.database = database
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    args.precursors = precursors
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.matures = matures
    # TODO check numbers of miRNA and precursors read
    # TODO print message if numbers mismatch
    out_dts = dict()
    for fn in args.files:
        if args.format == "gff":
            # Sample names come from the file itself and nothing is written
            # per file: the content only takes part in the final merge.
            samples.extend(header.read_samples(fn))
            out_dts[fn] = body.read(fn, args)
            continue
        sample = op.splitext(op.basename(fn))[0]
        samples.append(sample)
        fn_out = op.join(args.out, sample + ".%s" % args.out_format)
        if args.format == "BAM":
            reads = _read_bam(fn, args)
        elif args.format == "seqbuster":
            reads = seqbuster.read_file(fn, args)
        elif args.format == "srnabench":
            out_dts[fn] = srnabench.read_file(fn, args)
        elif args.format == "prost":
            reads = prost.read_file(fn, precursors, database, args.gtf)
        elif args.format == "isomirsea":
            out_dts[fn] = isomirsea.read_file(fn, args)
        else:
            # Without this branch an unknown format would fail later with an
            # UnboundLocalError on ``reads``.
            raise ValueError("%s format is not supported" % args.format)
        # These two importers already return the final body; the others
        # return reads that still have to be annotated.
        if args.format not in ["isomirsea", "srnabench"]:
            ann = annotate(reads, matures, precursors)
            out_dts[fn] = body.create(ann, database, sample, args)
        h = header.create([sample], database, "")
        _write(out_dts[fn], h, fn_out)
    # merge all reads for all samples into one dict
    merged = merge.merge(out_dts, samples)
    fn_merged_out = op.join(args.out, "mirtop.%s" % args.out_format)
    _write(merged, header.create(samples, database, ""), fn_merged_out)
예제 #11
0
    def test_alignment(self):
        """testing alignments function"""
        from mirtop.libs import logger
        logger.initialize_logger("test", True, True)
        logger = logger.getLogger(__name__)
        from mirtop.mirna import fasta, mapper
        precursors = fasta.read_precursor("data/examples/annotate/hairpin.fa",
                                          "hsa")
        matures = mapper.read_gtf_to_precursor(
            "data/examples/annotate/hsa.gff3")

        # matures = mirtop.mirna.read_mature("data/examples/annotate/mirnas.gff", "hsa")
        def annotate(fn, precursors, matures):
            """Read one SAM file, annotate its reads and build the GFF body."""
            from mirtop.bam import bam
            from mirtop.gff import body
            reads = bam.read_bam(fn, precursors)
            ann = bam.annotate(reads, matures, precursors)
            body.create(ann, "miRBase21", "example", fn + ".gff3", "#")

        # print() is used in call form so the test also parses on Python 3;
        # with a single string argument the output is the same on Python 2.
        print("\nlast1D\n")
        annotate("data/aligments/let7-last1D.sam", precursors, matures)
        #mirna TGAGGTAGTAGGTTGTATAGTT
        #seq   AGAGGTAGTAGGTTGTA
        print("\n1D\n")
        annotate("data/aligments/let7-1D.sam", precursors, matures)
        #mirna TGAGGTAG-TAGGTTGTATAGTT
        #seq   TGAGGTAGGTAGGTTGTATAGTTA
        print("\nlast7M1I\n")
        annotate("data/aligments/let7-last7M1I.sam", precursors, matures)
        #mirna TGAGGTAGTAGGTTGTATAGTT
        #seq   TGAGGTAGTAGGTTGTA-AGT
        print("\nmiddle1D\n")
        annotate("data/aligments/let7-middle1D.sam", precursors, matures)
        #mirna TGAGGTAGTAGGTTGTATAGTT
        #seq   TGAGGTAGTAGGTTGTATAGTT
        print("\nperfect\n")
        annotate("data/aligments/let7-perfect.sam", precursors, matures)
        #mirna TGAGGTAGTAGGTTGTATAGTT
        #seq   TGAGGTAGTAGGTTGTATAG (3tt 3TT)
        print("\ntriming\n")
        annotate("data/aligments/let7-triming.sam", precursors, matures)
예제 #12
0
    def test_srnabench(self):
        """testing reading sRNAbench files function"""
        from mirtop.libs import logger
        logger.initialize_logger("test", True, True)
        logger = logger.getLogger(__name__)
        from mirtop.mirna import fasta, mapper
        precursors = fasta.read_precursor("data/examples/annotate/hairpin.fa",
                                          "hsa")
        matures = mapper.read_gtf_to_precursor(
            "data/examples/annotate/hsa.gff3")

        def annotate(fn, precursors, matures):
            """Read one sRNAbench file and annotate its reads."""
            from mirtop.importer import srnabench
            from mirtop.bam import bam
            reads = srnabench.read_file(fn, precursors)
            # Only checks that annotation runs without raising.
            bam.annotate(reads, matures, precursors)
            return True

        # Call form of print: valid on Python 3 and same output on Python 2.
        print("\nsRNAbench\n")
        annotate("data/examples/srnabench/reads.annotation", precursors,
                 matures)
예제 #13
0
 def test_collapse(self):
     """testing GFF function"""
     from mirtop.libs import logger
     from mirtop.mirna import mapper, fasta
     from mirtop.gff import body, header
     logger.initialize_logger("test", True, True)
     logger = logger.getLogger(__name__)
     precursors = fasta.read_precursor("data/examples/annotate/hairpin.fa",
                                       "hsa")
     # depend on https://github.com/miRTop/mirtop/issues/6
     matures = mapper.read_gtf_to_precursor(
         "data/examples/annotate/hsa.gff3")
     # matures = mirtop.mirna.read_mature("data/examples/annotate/mirnas.gff", "hsa")
     from mirtop.bam import bam
     bam_fn = "data/aligments/collapsing-isomirs.sam"
     reads = bam.read_bam(bam_fn, precursors)
     ann = bam.annotate(reads, matures, precursors)
     fn = bam_fn + ".gff"
     h = header.create(bam_fn, ["example"], "miRBase21")
     # Pass the header that was just built; the ``header`` module itself
     # was being handed over before, leaving ``h`` unused.
     gff = body.create(ann, "miRBase21", "example", fn, h)
     # Call form of print: valid on Python 3 and same output on Python 2.
     print(gff)
     return True
예제 #14
0
파일: read.py 프로젝트: srinivas32/mirtop
def reader(args):
    """
    Realign BAM hits to miRBase to get better accuracy and annotation

    Low-memory variant: each input file is handed, together with an open
    output handle, to a reader that writes the records itself. One output
    file per input is created inside ``args.out``.

    Args:
        args: options object. ``database``, ``precursors`` and ``matures``
            are stored on it before the readers are called.

    Raises:
        ValueError: if ``args.format`` is neither "BAM" nor "seqbuster".
    """
    args.database = mapper.guess_database(args)
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    args.precursors = precursors
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.matures = matures
    # TODO check numbers of miRNA and precursors read
    # TODO print message if numbers mismatch
    if args.keep_name and len(args.files) > 1:
        logger.warning("--keep-name when running multiple samples\n"
                       "can generate wrong results if the\n"
                       "name read is different across sample\n"
                       "for the same sequence.")
    for fn in args.files:
        fn = op.normpath(fn)
        # Checked before the output is created, so an unsupported format
        # no longer leaves a file holding only the header behind.
        if args.format not in ("BAM", "seqbuster"):
            raise ValueError("%s not supported for low memory" % args.format)
        sample = op.splitext(op.basename(fn))[0]
        fn_out = op.join(args.out, sample + ".%s" % args.out_format)
        h = header.create([sample], args.database, "")
        # ``with`` closes the handle even when a reader raises.
        with open(fn_out, 'w') as out_handle:
            print(h, file=out_handle)
            if args.format == "BAM":
                if args.genomic:
                    low_memory_genomic_bam(fn, sample, out_handle, args)
                else:
                    low_memory_bam(fn, sample, out_handle, args)
            else:
                seqbuster.read_file_low_memory(fn, sample, args, out_handle)
예제 #15
0
def reader(args):
    """
    Realign BAM hits to miRBase to get better accuracy and annotation

    Each file in ``args.files`` is read with the importer selected by
    ``args.format``, annotated and passed to ``body.create`` together with
    the output name ``<args.out>/<sample>.gff``.

    Args:
        args: options object; ``gtf``, ``hairpin``, ``sps``, ``files``,
            ``out`` and ``format`` are read here.

    Raises:
        ValueError: if ``args.format`` is not "BAM", "seqbuster" or
            "srnabench".
    """
    database = mapper.guess_database(args.gtf)
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    matures = mapper.read_gtf_to_precursor(args.gtf)
    # check numbers of miRNA and precursors read
    # print message if numbers mismatch
    # NOTE(review): the results are collected here but never returned.
    out_dts = dict()
    for fn in args.files:
        sample = op.splitext(op.basename(fn))[0]
        fn_out = op.join(args.out, sample + ".gff")
        if args.format == "BAM":
            reads = _read_bam(fn, precursors)
        elif args.format == "seqbuster":
            reads = seqbuster.read_file(fn, precursors)
            # NOTE(review): the returned value is not used - confirm whether
            # it was meant to be passed to header.create below.
            seqbuster.header()
        elif args.format == "srnabench":
            # Was ``read_gile``: a typo for the importer's ``read_file``.
            reads = srnabench.read_file(fn, precursors)
        else:
            # Without this branch ``reads`` would be unbound below.
            raise ValueError("%s format is not supported" % args.format)
        h = header.create([sample], database, "")
        ann = annotate(reads, matures, precursors)
        out_dts[fn] = body.create(ann, database, sample, fn_out, h)
예제 #16
0
def annotate(fn, read_file, load=False, create=True):
    """Annotate reads against the example data and build the GFF body.

    Args:
        fn: input file handed to ``read_file`` (ignored when ``load``).
        read_file: reader called as ``read_file(fn, args)``; when ``load``
            is true it is used directly as the reads.
        load: treat ``read_file`` as already loaded reads.
        create: annotate the reads and call ``body.create``.

    Returns:
        The result of ``body.create`` when ``create`` is true, otherwise
        the ``mirtop.gff.body`` module itself.
    """
    import argparse
    args = argparse.Namespace(
        hairpin="data/examples/annotate/hairpin.fa",
        sps="hsa",
        gtf="data/examples/annotate/hsa.gff3",
        add_extra=True,
        out_format="gtf")
    from mirtop.mirna import fasta, mapper
    hairpins = fasta.read_precursor(args.hairpin, args.sps)
    mature_map = mapper.read_gtf_to_precursor(args.gtf)
    args.precursors = hairpins
    args.matures = mature_map
    args.database = mapper.guess_database(args.gtf)
    from mirtop.mirna import annotate as annotator
    from mirtop.gff import body as gff_body
    reads = read_file if load else read_file(fn, args)
    result = gff_body
    if create:
        annotated = annotator.annotate(reads, mature_map, hairpins)
        result = gff_body.create(annotated, "miRBase21", "Example", args)
    return result
예제 #17
0
def annotate(fn, read_file, load=False, create=True):
    """Run the example annotation pipeline on one input.

    ``read_file`` is either a reader called as ``read_file(fn, args)`` or,
    when ``load`` is true, the reads themselves. With ``create`` set the
    reads are annotated and the value of ``body.create`` is returned;
    otherwise the ``mirtop.gff.body`` module is returned unchanged.
    """
    import argparse
    options = argparse.Namespace()
    for key, value in (("hairpin", "data/examples/annotate/hairpin.fa"),
                       ("sps", "hsa"),
                       ("gtf", "data/examples/annotate/hsa.gff3"),
                       ("add_extra", True),
                       ("out_format", "gtf")):
        setattr(options, key, value)
    from mirtop.mirna import fasta, mapper
    options.precursors = fasta.read_precursor(options.hairpin, options.sps)
    options.matures = mapper.read_gtf_to_precursor(options.gtf)
    options.database = mapper.guess_database(options.gtf)
    from mirtop.mirna import annotate
    from mirtop.gff import body
    if load:
        reads = read_file
    else:
        reads = read_file(fn, options)
    if not create:
        return body
    ann = annotate.annotate(reads, options.matures, options.precursors)
    return body.create(ann, "miRBase21", "Example", options)
예제 #18
0
파일: prost.py 프로젝트: chapmanb/mirtop
def read_file(fn, hairpins, database, mirna_gtf):
    """
    Read PROST! file and convert to mirtop GFF format.

    Args:
        *fn(str)*: file name with PROST output information.

        *hairpins*: precursor sequences indexed by precursor name.

        *database(str)*: database name (not used by this function).

        *mirna_gtf(str)*: annotation file given to
            *mapper.read_gtf_to_mirna()* and
            *mapper.read_gtf_to_precursor()*.

    Returns:
        *reads*: dictionary where keys are read_id and values are *mirtop.realign.hits*

    """
    reads = defaultdict(hits)
    # NOTE(review): the result is never used below; the call is kept only
    # to leave the behaviour on unreadable annotation files unchanged.
    genomics = mapper.read_gtf_to_mirna(mirna_gtf)
    matures = mapper.read_gtf_to_precursor(mirna_gtf)
    non_mirna = 0
    non_chromosome_mirna = 0
    outside_mirna = 0
    lines_read = 0
    ann, ann_type = _group_seqs_by_ann(fn)
    with open(fn) as handle:
        # The first line is skipped; presumably it is the column header.
        handle.readline()
        for line in handle:
            lines_read += 1
            cols = line.strip().split("\t")
            # The first column is used both as read name and as sequence.
            query_name = cols[0]
            query_sequence = cols[0]
            if not ann[query_sequence]:
                non_mirna += 1
                continue
            miRNA = ann_type[ann[query_sequence]][1]
            preNames = ann_type[ann[query_sequence]][0]
            if query_name not in reads and query_sequence is None:
                continue
            if query_sequence and query_sequence.find("N") > -1:
                continue
            reads[query_name].set_sequence(query_sequence)
            reads[query_name].counts = cols[9]
            for preName in preNames.split(","):
                if preName in reads[query_name].precursors:
                    continue
                if preName not in hairpins:
                    non_chromosome_mirna += 1
                    continue
                reference_start = _align_to_mature(query_sequence, hairpins[preName], matures[preName][miRNA])
                # The local names below are looked up through locals(), so
                # they must not be renamed.
                logger.debug("\nPROST!::NEW::query: {query_sequence}\n"
                             "  precursor {preName}\n"
                             "  name:  {query_name}\n"
                             "  reference_start: {reference_start}\n"
                             "  mirna: {miRNA}".format(**locals()))
                iso = isomir()
                iso.align = line
                iso.set_pos(reference_start, len(reads[query_name].sequence))
                logger.debug("PROST!:: start %s end %s" % (iso.start, iso.end))
                if len(hairpins[preName]) < reference_start + len(reads[query_name].sequence):
                    # The read would run past the end of the precursor.
                    # This counter was reported below but never updated.
                    outside_mirna += 1
                    continue
                iso.subs, iso.add, iso.cigar = filter.tune(
                    reads[query_name].sequence,
                    hairpins[preName],
                    reference_start, None)
                logger.debug("PROST!::After tune start %s end %s" % (
                    iso.start, iso.end))
                # Hits with two or more substitutions are not kept.
                if len(iso.subs) < 2:
                    reads[query_name].set_precursor(preName, iso)
    logger.info("Lines loaded: %s" % lines_read)
    logger.info("Skipped lines because non miRNA in line: %s" % non_mirna)
    logger.info("Skipped lines because non chromosome in GTF:"
                " %s" % non_chromosome_mirna)
    logger.info("Skipped lines because outside precursor: %s" % outside_mirna)
    logger.info("Hits: %s" % len(reads))
    return reads
예제 #19
0
                                help="give expression", default=False)
parser.add_option("-p", "--prefix", help="output name")
parser.add_option("--seed", help="set up seed for reproducibility.", default = None)


(options, args) = parser.parse_args()

# Seed the random module only when --seed was given on the command line.
if options.seed:
    random.seed(options.seed)

# All output names are derived from --prefix.
full_fq = "%s_full.fq" % options.prefix
clean_fq = "%s_clean.fq" % options.prefix
out_gff = "%s.gff" % options.prefix
# Remove FASTQ files left over from a previous run.
# NOTE(review): presumably they are appended to elsewhere in this script -
# confirm against create_iso.
if os.path.exists(full_fq):
    os.remove(full_fq)
if os.path.exists(clean_fq):
    os.remove(clean_fq)

# Precursor sequences (empty species string) and precursor/mature mapping.
pre = fasta.read_precursor(options.fa, "")
mir = mapper.read_gtf_to_precursor(options.gtf)

nt = ['A', 'T', 'G', 'C']
# Collects what create_iso returns for every precursor.
gffs = dict()
h = header.create(["sampleX"], "miRBase1", "")
for precursor in pre:
    seq = pre[precursor]
    gffs.update(create_iso(precursor, mir, seq, options.numsim, options.exp))


# Write the collected entries, with the header, to <prefix>.gff.
_write(gffs, h, out_gff)
예제 #20
0
파일: convert.py 프로젝트: miRTop/mirtop
def convert_gff_counts(args):
    """ Reads a GFF file to produces output file containing Expression counts

    The header row is built from the ``## COLDATA:`` line of the GFF file
    and every following line becomes one row of ``expression_counts.tsv``
    inside ``args.out``. Lines whose UID cannot be decoded are skipped.
    With ``args.add_extra`` the nucleotide-level variant columns are added
    and lines with an unknown parent, an unknown miRNA or an invalid
    variant are skipped as well.

    Args:
        *args(namedtuple)*: arguments parsed from command line with
            *mirtop.libs.parse.add_subparser_counts()*.

    Returns:
        *file (file)*: with columns like:
            UID miRNA Variant Sample1 Sample2 ... Sample N
    """
    sep = "\t"
    variant_header = sep.join(['iso_5p', 'iso_3p',
                               'iso_add', 'iso_snp'])
    if args.add_extra:
        precursors = fasta.read_precursor(args.hairpin, args.sps)
        matures = mapper.read_gtf_to_precursor(args.gtf)
        variant_header = sep.join([variant_header,
                                   'iso_5p_nt', 'iso_3p_nt',
                                   'iso_add_nt', 'iso_snp_nt'])

    logger.info("INFO Reading GFF file %s", args.gff)
    logger.info("INFO Writing TSV file to directory %s", args.out)

    out_file = op.join(args.out, "expression_counts.tsv")
    missing_parent = 0
    missing_mirna = 0
    unvalid_uid = 0
    # Both handles are managed by the same ``with`` so that neither leaks
    # if parsing fails half way through the file.
    with open(args.gff, 'r') as gff_file, open(out_file, 'w') as outh:

        # Consume the header up to (and including) the COLDATA line.
        for samples_line in gff_file:
            if samples_line.startswith("## COLDATA:"):
                samples = sep.join(samples_line.strip().split("COLDATA:")[1].strip().split(","))
                header_line = sep.join(['UID', 'Read', 'miRNA', 'Variant',
                                        variant_header, samples])
                print(header_line, file=outh)
                break

        # The iterator resumes right after the COLDATA line.
        for mirna_line in gff_file:
            attrb = read_gff_line(mirna_line)["attrb"]
            Read = attrb["Read"]
            UID = attrb["UID"]
            mirna = attrb["Name"]
            parent = attrb["Parent"]
            variant = attrb["Variant"]
            # Called only to validate the UID; the decoded value is unused.
            try:
                read_id(UID)
            except KeyError:
                unvalid_uid += 1
                continue

            expression = sep.join(attrb["Expression"].strip().split(","))
            cols_variants = sep.join(_expand(variant))
            logger.debug("COUNTS::Read:%s" % Read)
            logger.debug("COUNTS::EXTRA:%s" % variant)
            if args.add_extra:
                if parent not in precursors:
                    missing_parent += 1
                    continue
                if mirna not in matures[parent]:
                    missing_mirna += 1
                    continue
                extra = variant_with_nt(mirna_line, precursors, matures)
                if extra == "Invalid":
                    continue
                logger.debug("COUNTS::EXTRA:%s" % extra)
                cols_variants = sep.join([cols_variants] + _expand(extra, True))
            summary = sep.join([UID, Read,  mirna, variant,
                                cols_variants, expression])
            logger.debug(summary)
            print(summary, file=outh)

    logger.info("Missing Parents in hairpin file: %s" % missing_parent)
    logger.info("Missing MiRNAs in GFF file: %s" % missing_mirna)
    logger.info("Non valid UID: %s" % unvalid_uid)
    logger.info("Output file is at %s" % out_file)
예제 #21
0
def create_vcf(mirgff3, precursor, gtf, vcffile):
    """Convert a mirGFF3 file into a VCF file.

    Args:
        'mirgff3(str)': File with mirGFF3 format that will be converted
        'precursor(str)': Fasta format sequences of all miRNA hairpins
        'gtf(str)': Genome coordinates
        'vcffile': name of the file to be saved
    Returns:
        Nothing is returned, instead, a VCF file is generated
    Raises:
        SystemExit: with status 1 when *mirgff3* cannot be opened.
    """
    # Check that the input file can be opened. "utf-8-sig" drops a leading
    # BOM, which is what the Python 2 branch below already does explicitly.
    try:
        gff3_file = open(mirgff3, "r", encoding="utf-8-sig") if six.PY3 else open(
            mirgff3, "r")
    except IOError:
        print("Can't read the file %s" % mirgff3)
        sys.exit(1)
    with gff3_file:
        data = gff3_file.read()
        if six.PY2:
            data = data.decode("utf-8-sig").encode("utf-8")

    gff3_data = data.split("\n")

    # Fallback value built from every line mentioning the source ontology;
    # a matching line in the leading "##" header block overrides it below.
    source = "\n".join(s for s in gff3_data
                       if "## source-ontology: " in s)[20:]
    sample_names = []
    # Only the leading run of "##" lines is treated as the header.
    for header_line in gff3_data:
        if not header_line.startswith("##"):
            break
        if header_line.startswith("## source-ontology:"):
            source = header_line[20:]
        elif header_line.startswith("## COLDATA:"):
            sample_names = header_line[12:].split(",")
    n_samples = len(sample_names)

    # The context manager closes the output even if parsing fails midway.
    with open(vcffile, "w") as vcf_file:
        ver = "v4.3"  # Current VCF version formatting
        vcf_file.write("##fileformat=VCF%s\n" % ver)
        date = datetime.datetime.now().strftime("%Y%m%d")
        vcf_file.write("##fileDate=%s\n" % date)
        vcf_file.write("##source=%s\n" % source)
        # Meta-information lines follow the VCF layout:
        # <ID=..,Number=..,Type=..,Description="..">
        vcf_file.write(
            '##INFO=<ID=NS,Number=1,Type=Integer,'
            'Description="Number of samples">\n')
        vcf_file.write('##FILTER=<ID=REJECT,Description="Filter not passed">\n')
        vcf_file.write(
            '##FORMAT=<ID=TRC,Number=1,Type=Integer,Description="Total read count">\n'
        )
        vcf_file.write(
            '##FORMAT=<ID=TSC,Number=1,Type=Integer,Description="Total SNP count">\n'
        )
        vcf_file.write(
            '##FORMAT=<ID=TMC,Number=1,Type=Integer,Description="Total miRNA count">\n'
        )
        # GT values written below are strings such as "1|0".
        vcf_file.write(
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n')
        header = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"
        # No newline here: every record line starts with its own "\n".
        vcf_file.write("\t".join([header] + sample_names))

        key_list, all_dict, mirna_dict = _collect_vcf_variants(
            gff3_data, precursor, gtf, n_samples)

        #  Writing the VCF file:
        for key in key_list:
            vcf_file.write(
                _format_vcf_record(all_dict[key], mirna_dict, n_samples))


def _iso_value(variants, tag):
    """Return the integer that follows *tag* in the first matching variant, else 0."""
    hits = [v for v in variants if tag in v]
    if not hits:
        return 0
    # Skip the tag itself plus the one-character separator after it.
    return int(hits[0][len(tag) + 1:])


def _collect_vcf_variants(gff3_data, precursor, gtf, n_samples):
    """Parse the mirGFF3 lines and gather per-variant and per-miRNA counts.

    Returns a tuple ``(key_list, all_dict, mirna_dict)``: the variant keys in
    first-seen order, the VCF fields for each key, and the summed expression
    ("Z") for each miRNA name.
    """
    all_dict = {}  # variant key -> VCF fields and counts
    key_list = []  # variant keys in order of first appearance
    mirna_dict = {}  # miRNA name -> {"Z": summed expression per sample}
    n_SNP = 0
    n_noSNP = 0
    hairpins = read_precursor(precursor)
    gff3 = read_gtf_to_precursor(gtf)
    gtf_dic = read_gtf_to_mirna(gtf)
    for raw_line in gff3_data:
        # Skip blank lines and every kind of comment/header line.
        if not raw_line.strip() or raw_line.lstrip().startswith("#"):
            continue
        gff_fields = read_gff_line(raw_line)
        attrb = gff_fields['attrb']
        gtf_name = attrb['Name']
        gtf_parent = attrb['Parent']
        if gtf_parent not in gff3:
            continue
        if gtf_name not in gff3[gtf_parent]:
            continue
        parent_ini_pos = gff3[gtf_parent][gtf_name][0]
        parent_end_pos = gff3[gtf_parent][gtf_name][1]
        hairpin = hairpins[gtf_parent]
        ref_seq = hairpin[parent_ini_pos:parent_end_pos + 1]
        vcf_chrom = gtf_dic[gtf_name][gtf_parent][0]
        vcf_pos = int(gff_fields['start']) + int(
            gtf_dic[gtf_name][gtf_parent][1])
        variants = attrb['Variant'].split(",")
        logger.debug("VCF::Variant::%s" % variants)
        #  Obtaining the iso_3p, iso_add3p and iso_5p values:
        var3p = _iso_value(variants, 'iso_3p') + _iso_value(
            variants, 'iso_add3p')
        logger.debug("VCF::VAR_3p::%s" % var3p)
        var5p = _iso_value(variants, 'iso_5p')
        logger.debug("VCF::VAR_5p::%s" % var5p)
        cigar = attrb["Cigar"]
        # Obtaining all the variants from the cigar:
        (key_pos, key_var, vcf_ref, vcf_alt) = cigar_2_key(
            cigar, attrb['Read'], ref_seq, vcf_pos,
            var5p, var3p, parent_ini_pos, parent_end_pos, hairpin)
        if not len(key_var) > 0:
            continue

        # The expression values are the same for every variant of this read,
        # so they are parsed once instead of once per variant.
        raw_counts = [int(i) for i in attrb['Expression'].split(',')]
        # 1 for every sample where the read is expressed, else 0.
        nozero_counts = [int(i > 0) for i in raw_counts]

        # Adding the variants to a dictionary and calculating all the fields
        # of a vcf file format:
        for s, var_type in enumerate(key_var):
            key_dict = vcf_chrom + '-' + str(key_pos[s]) + '-' + str(var_type)
            # NOTE(review): Z is incremented once per variant of the read, so
            # a read carrying several variants contributes several times, and
            # reads without variants never contribute - confirm intended.
            if gtf_name in mirna_dict:
                mirna_dict[gtf_name]['Z'] = [
                    sum(x) for x in zip(mirna_dict[gtf_name]['Z'], raw_counts)
                ]
            else:
                mirna_dict[gtf_name] = {"Z": raw_counts}

            if key_dict in all_dict:
                entry = all_dict[key_dict]
                # Only single-nucleotide types accumulate counts.
                if entry["Type"] in ["A", "C", "T", "G"]:
                    entry['X'] = [
                        sum(x) for x in zip(entry['X'], nozero_counts)
                    ]
                    entry['Y'] = [sum(x) for x in zip(entry['Y'], raw_counts)]
                continue

            key_list.append(key_dict)
            entry = {
                "Chrom": vcf_chrom,
                "Position": key_pos[s],
                "mirna": gtf_name,
                "Type": var_type,
            }
            if var_type[0] in ["A", "C", "T", "G"]:
                n_SNP += 1
                entry["SNP"] = True
                entry["ID"] = gtf_name + '-SNP' + str(n_SNP)
                entry['X'] = nozero_counts
                entry['Y'] = raw_counts
            else:
                n_noSNP += 1
                entry["SNP"] = False
                entry["ID"] = gtf_name + '-nonSNP' + str(n_noSNP)
            entry["Ref"] = vcf_ref[s]
            entry["Alt"] = vcf_alt[s]
            entry["Qual"] = "."
            entry["Filter"] = attrb['Filter']
            entry["Info"] = "NS=" + str(n_samples)
            all_dict[key_dict] = entry
    return key_list, all_dict, mirna_dict


def _format_vcf_record(record, mirna_dict, n_samples):
    """Return the VCF data line for *record*, prefixed with a newline."""
    variant_line = "\n%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (
        record["Chrom"], record["Position"], record["ID"], record["Ref"],
        record["Alt"], record["Qual"], record["Filter"], record["Info"])
    # Records whose type is not a single nucleotide get no FORMAT or sample
    # columns.
    if record["Type"] not in ["A", "T", "C", "G"]:
        return variant_line

    samples = []
    for n in range(n_samples):
        x = record["X"][n]
        y = record["Y"][n]
        z = mirna_dict[record["mirna"]]["Z"][n]
        # Calculating the genotype:
        if y == 0:
            genotype = "0|0"
        elif z == y:
            genotype = "1|1"
        else:
            genotype = "1|0"
        samples.append("\t%s:%s:%s:%s" % (x, y, z, genotype))
    return variant_line + "\t" + "TRC:TSC:TMC:GT" + "".join(samples)
예제 #22
0
                  default=False)
# --prefix is the stem used below to build the output file names.
parser.add_option("-p", "--prefix", help="output name")
parser.add_option("--seed",
                  help="set up seed for reproducibility.",
                  default=None)

(options, args) = parser.parse_args()

# Seed the random module only when a truthy --seed value was given.
if options.seed:
    random.seed(options.seed)

# Output names derived from --prefix.
full_fq = "%s_full.fq" % options.prefix
clean_fq = "%s_clean.fq" % options.prefix
out_gff = "%s.gff" % options.prefix
# Delete any existing FASTQ files with these names.
# NOTE(review): presumably because later code appends to them - confirm.
if os.path.exists(full_fq):
    os.remove(full_fq)
if os.path.exists(clean_fq):
    os.remove(clean_fq)

# `pre` is iterated and indexed by precursor name below; `mir` is handed to
# create_iso unchanged.
pre = fasta.read_precursor(options.fa, "")
mir = mapper.read_gtf_to_precursor(options.gtf)

# NOTE(review): `nt` is not referenced in the statements visible here;
# presumably read as a global by helpers defined elsewhere - confirm.
nt = ['A', 'T', 'G', 'C']
gffs = dict()
h = header.create(["sampleX"], "miRBase1", "")
# Merge the result of create_iso for every precursor into one dict; a key
# produced again by a later precursor overwrites the earlier value.
for precursor in pre:
    seq = pre[precursor]
    gffs.update(create_iso(precursor, mir, seq, options.numsim, options.exp))

_write(gffs, h, out_gff)
예제 #23
0
def convert_gff_counts(args):
    """ Reads a GFF file and writes a TSV file containing Expression counts

    Args:
        *args(namedtuple)*: arguments parsed from command line with
            *mirtop.libs.parse.add_subparser_counts()*. The attributes read
            here are *gff*, *out* and *add_extra*, plus *hairpin*, *sps* and
            *gtf* when *add_extra* is set.

    Returns:
        Nothing. The table is written to ``<args.out>/<gff basename>.tsv``
        with columns:
            UID Read miRNA Variant iso_5p iso_3p iso_add3p iso_snp
            [iso_5p_nt iso_3p_nt iso_add3p_nt iso_snp_nt]
            Sample1 Sample2 ... Sample N
        The bracketed columns are present only when *args.add_extra* is set.
    """
    sep = "\t"
    variant_header = sep.join(['iso_5p', 'iso_3p', 'iso_add3p', 'iso_snp'])
    if args.add_extra:
        # Only needed to compute the nucleotide-level columns.
        precursors = fasta.read_precursor(args.hairpin, args.sps)
        matures = mapper.read_gtf_to_precursor(args.gtf)
        variant_header = sep.join([
            variant_header, 'iso_5p_nt', 'iso_3p_nt', 'iso_add3p_nt',
            'iso_snp_nt'
        ])

    logger.info("INFO Reading GFF file %s", args.gff)
    logger.info("INFO Writing TSV file to directory %s", args.out)

    out_file = op.join(args.out,
                       "%s.tsv" % op.splitext(op.basename(args.gff))[0])
    missing_parent = 0
    missing_mirna = 0
    unvalid_uid = 0
    # Both handles are context-managed so the input is closed even when a
    # line fails to parse.
    with open(args.gff, 'r') as gff_file, open(out_file, 'w') as outh:

        # Consume lines up to and including the COLDATA line, which names the
        # samples; the loop below resumes right after it on the same iterator.
        for samples_line in gff_file:
            if samples_line.startswith("## COLDATA:"):
                samples = sep.join(samples_line.strip().split("COLDATA:")
                                   [1].strip().split(","))
                header_row = sep.join([
                    'UID', 'Read', 'miRNA', 'Variant', variant_header, samples
                ])
                print(header_row, file=outh)
                break

        for mirna_line in gff_file:
            gff = feature(mirna_line)
            attr = gff.attributes
            UID = attr["UID"]
            Read = attr["Read"]
            mirna = attr["Name"]
            parent = attr["Parent"]
            variant = attr["Variant"]
            # read_id is called only to validate the UID; its result is unused.
            try:
                read_id(UID)
            except KeyError:
                unvalid_uid += 1
                continue

            expression = sep.join(attr["Expression"].strip().split(","))
            cols_variants = sep.join(_expand(variant))
            logger.debug("COUNTS::Read:%s" % Read)
            logger.debug("COUNTS::EXTRA:%s" % variant)
            if args.add_extra:
                if parent not in precursors:
                    missing_parent += 1
                    continue
                if mirna not in matures[parent]:
                    missing_mirna += 1
                    continue
                extra = variant_with_nt(mirna_line, precursors, matures)
                # Lines flagged "Invalid" are dropped without being counted.
                if extra == "Invalid":
                    continue
                logger.debug("COUNTS::EXTRA:%s" % extra)
                cols_variants = sep.join([cols_variants] +
                                         _expand(extra, True))
            summary = sep.join(
                [UID, Read, mirna, variant, cols_variants, expression])
            logger.debug(summary)
            print(summary, file=outh)

    logger.info("Missing Parents in hairpin file: %s", missing_parent)
    logger.info("Missing MiRNAs in GFF file: %s", missing_mirna)
    logger.info("Non valid UID: %s", unvalid_uid)
    logger.info("Output file is at %s", out_file)
예제 #24
0
 def test_read_hairpin_mirgenedb(self):
     from mirtop.mirna import mapper
     from mirtop.libs import logger
     logger.initialize_logger("test_read_files", True, True)
     map_mir = mapper.read_gtf_to_precursor("data/db/mirgenedb/hsa.gff")
     print(map_mir)