Пример #1
0
def reader(args):
    """
    Convert every input file to the output format and write a merged file.

    Loads the precursor and mature references named by ``args``, processes
    each path in ``args.files`` according to ``args.format``, writes one
    output per sample under ``args.out`` and finally a merged ``mirtop.*``
    file covering all samples. When ``args.low_memory`` is set the work is
    delegated to ``read.reader`` and nothing else here runs.

    Args:
        args: parsed command-line namespace. Reads ``low_memory``,
            ``hairpin``, ``sps``, ``gtf``, ``keep_name``, ``files``,
            ``format``, ``out`` and ``out_format``; sets ``database``,
            ``precursors`` and ``matures`` on it for downstream helpers.

    Returns:
        None. All results are written through ``_write``.

    Raises:
        ValueError: if ``args.format`` is not one of the handled formats.
    """
    if args.low_memory:
        read.reader(args)
        return None
    # Formats whose readers already return the per-file structure that is
    # stored in ``out_dts``, so they skip the annotate/body.create step.
    pre_annotated = ("isomirsea", "srnabench", "manatee", "optimir")
    samples = []
    database = mapper.guess_database(args)
    args.database = database
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    args.precursors = precursors
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.matures = matures
    # TODO check numbers of miRNA and precursors read
    # TODO print message if numbers mismatch
    out_dts = dict()
    if args.keep_name and len(args.files) > 1:
        logger.warning("--keep-name when running multiple samples\n"
                       "can generate wrong results if the\n"
                       "name read is different across sample\n"
                       "for the same sequence.")
    for fn in args.files:
        fn = op.normpath(fn)
        if args.format == "gff":
            # GFF inputs name their own samples and are only collected for
            # the merge below; no per-sample file is written for them.
            samples.extend(header.read_samples(fn))
            out_dts[fn] = body.read(fn, args)
            continue
        sample = op.splitext(op.basename(fn))[0]
        samples.append(sample)
        fn_out = op.join(args.out, sample + ".%s" % args.out_format)
        if args.format == "BAM":
            reads = _read_bam(fn, args)
        elif args.format == "seqbuster":
            reads = seqbuster.read_file(fn, args)
        elif args.format == "srnabench":
            out_dts[fn] = srnabench.read_file(fn, args)
        elif args.format == "prost":
            reads = prost.read_file(fn, precursors, database, args.gtf)
        elif args.format == "isomirsea":
            out_dts[fn] = isomirsea.read_file(fn, args)
        elif args.format == "manatee":
            out_dts[fn] = manatee.read_file(fn, database, args)
        elif args.format == "optimir":
            out_dts[fn] = optimir.read_file(fn, args)
        else:
            # Fail with a clear message instead of reaching annotate() with
            # ``reads`` unbound.
            raise ValueError("%s format is not supported" % args.format)
        if args.format not in pre_annotated:
            ann = annotate(reads, matures, precursors)
            out_dts[fn] = body.create(ann, database, sample, args)
        # NOTE(review): make_tools receives a plain string here but a list
        # for the merged header below -- confirm which one it expects.
        h = header.create([sample], database, header.make_tools(args.format))
        _write(out_dts[fn], h, fn_out, args)
    # merge all reads for all samples into one dict
    merged = merge.merge(out_dts, samples)
    fn_merged_out = op.join(args.out, "mirtop.%s" % args.out_format)
    _write(merged,
           header.create(samples, database, header.make_tools([args.format])),
           fn_merged_out, args)
Пример #2
0
def reader(args):
    """
    Convert every input file to the output format and write a merged file.

    Loads the precursor and mature references named by ``args``, processes
    each path in ``args.files`` according to ``args.format``, writes one
    output per sample under ``args.out`` and finally a merged ``mirtop.*``
    file covering all samples.

    Args:
        args: parsed command-line namespace. Reads ``gtf``, ``hairpin``,
            ``sps``, ``files``, ``format``, ``out`` and ``out_format``;
            sets ``database``, ``precursors`` and ``matures`` on it for
            downstream helpers.

    Returns:
        None. All results are written through ``_write``.

    Raises:
        ValueError: if ``args.format`` is not one of the handled formats.
    """
    # Formats whose readers already return the per-file structure that is
    # stored in ``out_dts``, so they skip the annotate/body.create step.
    pre_annotated = ("isomirsea", "srnabench")
    samples = []
    database = mapper.guess_database(args.gtf)
    args.database = database
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    args.precursors = precursors
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.matures = matures
    # TODO check numbers of miRNA and precursors read
    # TODO print message if numbers mismatch
    out_dts = dict()
    for fn in args.files:
        if args.format == "gff":
            # GFF inputs name their own samples and are only collected for
            # the merge below; no per-sample file is written for them.
            samples.extend(header.read_samples(fn))
            out_dts[fn] = body.read(fn, args)
            continue
        sample = op.splitext(op.basename(fn))[0]
        samples.append(sample)
        fn_out = op.join(args.out, sample + ".%s" % args.out_format)
        if args.format == "BAM":
            reads = _read_bam(fn, args)
        elif args.format == "seqbuster":
            reads = seqbuster.read_file(fn, args)
        elif args.format == "srnabench":
            out_dts[fn] = srnabench.read_file(fn, args)
        elif args.format == "prost":
            reads = prost.read_file(fn, precursors, database, args.gtf)
        elif args.format == "isomirsea":
            out_dts[fn] = isomirsea.read_file(fn, args)
        else:
            # Fail with a clear message instead of reaching annotate() with
            # ``reads`` unbound.
            raise ValueError("%s format is not supported" % args.format)
        if args.format not in pre_annotated:
            ann = annotate(reads, matures, precursors)
            out_dts[fn] = body.create(ann, database, sample, args)
        h = header.create([sample], database, "")
        _write(out_dts[fn], h, fn_out)
    # merge all reads for all samples into one dict
    merged = merge.merge(out_dts, samples)
    fn_merged_out = op.join(args.out, "mirtop.%s" % args.out_format)
    _write(merged, header.create(samples, database, ""), fn_merged_out)
Пример #3
0
 def test_collapse(self):
     """Annotate the collapsing-isomirs SAM example and print the GFF body.

     Reads the example hairpin FASTA and GFF3 references, annotates the
     reads of ``collapsing-isomirs.sam`` and builds the GFF body from
     them. Nothing is asserted; the test passes when no step raises.
     """
     from mirtop.libs import logger
     from mirtop.mirna import mapper, fasta
     from mirtop.gff import body, header
     logger.initialize_logger("test", True, True)
     logger = logger.getLogger(__name__)
     precursors = fasta.read_precursor("data/examples/annotate/hairpin.fa",
                                       "hsa")
     # depend on https://github.com/miRTop/mirtop/issues/6
     matures = mapper.read_gtf_to_precursor(
         "data/examples/annotate/hsa.gff3")
     # matures = mirtop.mirna.read_mature("data/examples/annotate/mirnas.gff", "hsa")
     from mirtop.bam import bam
     bam_fn = "data/aligments/collapsing-isomirs.sam"
     reads = bam.read_bam(bam_fn, precursors)
     ann = bam.annotate(reads, matures, precursors)
     fn = bam_fn + ".gff"
     # NOTE(review): other callers pass (samples, database, custom) to
     # header.create -- confirm this argument order against its signature.
     h = header.create(bam_fn, ["example"], "miRBase21")
     # Pass the header built above; the original handed over the ``header``
     # module itself and left ``h`` unused.
     gff = body.create(ann, "miRBase21", "example", fn, h)
     # Function-call form is valid on both Python 2 and Python 3.
     print(gff)
     return True
Пример #4
0
def reader(args):
    """
    Annotate each input file in low-memory mode, writing as it goes.

    Loads the precursor and mature references named by ``args``, then for
    every path in ``args.files`` opens ``<sample>.<out_format>`` under
    ``args.out``, writes the header and hands the open file to the
    format-specific low-memory reader.

    Args:
        args: parsed command-line namespace. Reads ``hairpin``, ``sps``,
            ``gtf``, ``keep_name``, ``files``, ``format``, ``genomic``,
            ``out`` and ``out_format``; sets ``database``, ``precursors``
            and ``matures`` on it for downstream helpers.

    Returns:
        None. All results are written to the per-sample output files.

    Raises:
        ValueError: if ``args.format`` is neither "BAM" nor "seqbuster".
    """
    database = mapper.guess_database(args)
    args.database = database
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    args.precursors = precursors
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.matures = matures
    # TODO check numbers of miRNA and precursors read
    # TODO print message if numbers mismatch
    if args.keep_name and len(args.files) > 1:
        logger.warning("--keep-name when running multiple samples\n"
                       "can generate wrong results if the\n"
                       "name read is different across sample\n"
                       "for the same sequence.")
    for fn in args.files:
        # Reject unsupported formats before touching the file system so no
        # header-only output file is left behind.
        if args.format not in ("BAM", "seqbuster"):
            raise ValueError("%s not supported for low memory" % args.format)
        fn = op.normpath(fn)
        sample = op.splitext(op.basename(fn))[0]
        fn_out = op.join(args.out, sample + ".%s" % args.out_format)
        h = header.create([sample], args.database, "")
        # The context manager closes the handle even if a reader raises.
        with open(fn_out, 'w') as out_handle:
            print(h, file=out_handle)
            if args.format == "BAM":
                if args.genomic:
                    low_memory_genomic_bam(fn, sample, out_handle, args)
                else:
                    low_memory_bam(fn, sample, out_handle, args)
            else:
                seqbuster.read_file_low_memory(fn, sample, args, out_handle)
Пример #5
0
def reader(args):
    """
    Annotate every input file and build its GFF body.

    Loads the precursor and mature references named by ``args``, reads
    each path in ``args.files`` with the reader matching ``args.format``,
    annotates the reads and passes them to ``body.create`` together with
    the ``<sample>.gff`` output path under ``args.out``.

    Args:
        args: parsed command-line namespace. Reads ``gtf``, ``hairpin``,
            ``sps``, ``files``, ``format`` and ``out``.

    Returns:
        None. The per-file results are kept only in a local dict.

    Raises:
        ValueError: if ``args.format`` is not "BAM", "seqbuster" or
            "srnabench".
    """
    database = mapper.guess_database(args.gtf)
    # hairpin, mirna = download_mirbase(args)
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    matures = mapper.read_gtf_to_precursor(args.gtf)
    # TODO check numbers of miRNA and precursors read
    # TODO print message if numbers mismatch
    out_dts = dict()
    for fn in args.files:
        sample = op.splitext(op.basename(fn))[0]
        fn_out = op.join(args.out, sample + ".gff")
        if args.format == "BAM":
            reads = _read_bam(fn, precursors)
        elif args.format == "seqbuster":
            reads = seqbuster.read_file(fn, precursors)
            # NOTE(review): this value is never used; the header below is
            # created with an empty custom string -- confirm the intent.
            custom = seqbuster.header()
        elif args.format == "srnabench":
            # Was ``read_gile``; every other reader call in this file uses
            # ``read_file``.
            reads = srnabench.read_file(fn, precursors)
        else:
            # Fail with a clear message instead of reaching annotate() with
            # ``reads`` unbound.
            raise ValueError("%s format is not supported" % args.format)
        h = header.create([sample], database, "")
        ann = annotate(reads, matures, precursors)
        out_dts[fn] = body.create(ann, database, sample, fn_out, h)
Пример #6
0
                  default=False)
parser.add_option("-p", "--prefix", help="output name")
parser.add_option("--seed", default=None,
                  help="set up seed for reproducibility.")

options, args = parser.parse_args()

# Seed the random generator only when --seed was supplied.
if options.seed:
    random.seed(options.seed)

# Every output path is derived from the --prefix option.
full_fq = "%s_full.fq" % options.prefix
clean_fq = "%s_clean.fq" % options.prefix
out_gff = "%s.gff" % options.prefix
# Delete any existing FASTQ files at these paths before continuing.
for stale_fq in (full_fq, clean_fq):
    if os.path.exists(stale_fq):
        os.remove(stale_fq)

pre = fasta.read_precursor(options.fa, "")
mir = mapper.read_gtf_to_precursor(options.gtf)

nt = ['A', 'T', 'G', 'C']
gffs = {}
h = header.create(["sampleX"], "miRBase1", "")
# Collect the create_iso() result for each precursor into one dict.
for precursor in pre:
    seq = pre[precursor]
    simulated = create_iso(precursor, mir, seq, options.numsim, options.exp)
    gffs.update(simulated)

_write(gffs, h, out_gff)
Пример #7
0
                                help="give expression", default=False)
parser.add_option("-p", "--prefix", help="output name")
parser.add_option(
    "--seed",
    help="set up seed for reproducibility.",
    default=None,
)

(options, args) = parser.parse_args()

# A seed is applied only when one was given on the command line.
if options.seed:
    random.seed(options.seed)

# Output file names, all built from the --prefix option.
full_fq = "%s_full.fq" % options.prefix
clean_fq = "%s_clean.fq" % options.prefix
out_gff = "%s.gff" % options.prefix

# Remove existing FASTQ outputs, checking each path just before deleting.
for fastq_path in [full_fq, clean_fq]:
    if os.path.exists(fastq_path):
        os.remove(fastq_path)

pre = fasta.read_precursor(options.fa, "")
mir = mapper.read_gtf_to_precursor(options.gtf)

nt = ['A', 'T', 'G', 'C']
gffs = {}
h = header.create(["sampleX"], "miRBase1", "")

# Accumulate the isomiR entries returned for every precursor sequence.
for precursor in pre:
    seq = pre[precursor]
    gffs.update(
        create_iso(precursor, mir, seq, options.numsim, options.exp)
    )

_write(gffs, h, out_gff)