def reader(args):
    """
    Annotate every input file and write per-sample plus merged output.

    When ``args.low_memory`` is set the whole job is handed to
    ``read.reader`` and nothing else happens here. Otherwise the database,
    precursors and matures are loaded (and cached on ``args``), each file in
    ``args.files`` is parsed according to ``args.format``, written to
    ``<args.out>/<sample>.<args.out_format>``, and finally all samples are
    merged into ``<args.out>/mirtop.<args.out_format>``.

    Input in "gff" format is only read and merged; no per-sample file is
    written for it.

    Args:
        args: options object; reads ``low_memory``, ``hairpin``, ``sps``,
            ``gtf``, ``keep_name``, ``files``, ``format``, ``out`` and
            ``out_format``; sets ``database``, ``precursors`` and ``matures``.

    Returns:
        None
    """
    # Low-memory runs are delegated completely to the other reader.
    if args.low_memory:
        read.reader(args)
        return None

    samples = []
    database = mapper.guess_database(args)
    args.database = database
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    args.precursors = precursors
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.matures = matures
    # TODO check numbers of miRNA and precursors read
    # TODO print message if numbers mismatch
    annotated = dict()

    if args.keep_name and len(args.files) > 1:
        logger.warning("--keep-name when running multiple samples\n"
                       "can generate wrong results if the\n"
                       "name read is different across sample\n"
                       "for the same sequence.")

    for in_path in args.files:
        in_path = op.normpath(in_path)

        # GFF input carries its own sample names and needs no annotation
        # nor a per-sample output file.
        if args.format == "gff":
            samples.extend(header.read_samples(in_path))
            annotated[in_path] = body.read(in_path, args)
            continue

        sample_name = op.splitext(op.basename(in_path))[0]
        samples.append(sample_name)
        extension = ".%s" % args.out_format
        out_path = op.join(args.out, sample_name + extension)

        # These importers return the final structure directly.
        if args.format == "srnabench":
            annotated[in_path] = srnabench.read_file(in_path, args)
        elif args.format == "isomirsea":
            annotated[in_path] = isomirsea.read_file(in_path, args)
        elif args.format == "manatee":
            annotated[in_path] = manatee.read_file(in_path, database, args)
        elif args.format == "optimir":
            annotated[in_path] = optimir.read_file(in_path, args)
        else:
            # The remaining formats yield raw hits that still have to be
            # annotated. An unrecognised format leaves `hits` unbound, so
            # the annotate() call below fails, exactly as before.
            if args.format == "BAM":
                hits = _read_bam(in_path, args)
            elif args.format == "seqbuster":
                hits = seqbuster.read_file(in_path, args)
            elif args.format == "prost":
                hits = prost.read_file(in_path, precursors, database,
                                       args.gtf)
            annotation = annotate(hits, matures, precursors)
            annotated[in_path] = body.create(annotation, database,
                                             sample_name, args)

        # NOTE(review): make_tools gets a bare string here but a list for
        # the merged header below - confirm which form it expects.
        sample_header = header.create([sample_name], database,
                                      header.make_tools(args.format))
        _write(annotated[in_path], sample_header, out_path, args)

    # merge all reads for all samples into one dict
    # (args.low_memory was already tested on entry; check kept unchanged)
    if args.low_memory:
        return None
    merged = merge.merge(annotated, samples)
    merged_path = op.join(args.out, "mirtop.%s" % args.out_format)
    merged_header = header.create(samples, database,
                                  header.make_tools([args.format]))
    _write(merged, merged_header, merged_path, args)
def reader(args):
    """
    Annotate every input file and write per-sample plus merged output.

    The database (guessed from ``args.gtf``), precursors and matures are
    loaded and cached on ``args``. Each file in ``args.files`` is parsed
    according to ``args.format`` and written to
    ``<args.out>/<sample>.<args.out_format>``; afterwards all samples are
    merged into ``<args.out>/mirtop.<args.out_format>``.

    Input in "gff" format is only read and merged; no per-sample file is
    written for it.

    Args:
        args: options object; reads ``gtf``, ``hairpin``, ``sps``, ``files``,
            ``format``, ``out`` and ``out_format``; sets ``database``,
            ``precursors`` and ``matures``.
    """
    samples = []
    database = mapper.guess_database(args.gtf)
    args.database = database
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    args.precursors = precursors
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.matures = matures
    # TODO check numbers of miRNA and precursors read
    # TODO print message if numbers mismatch
    annotated = dict()

    for in_path in args.files:
        # GFF input carries its own sample names and needs no annotation
        # nor a per-sample output file.
        if args.format == "gff":
            samples.extend(header.read_samples(in_path))
            annotated[in_path] = body.read(in_path, args)
            continue

        sample_name = op.splitext(op.basename(in_path))[0]
        samples.append(sample_name)
        extension = ".%s" % args.out_format
        out_path = op.join(args.out, sample_name + extension)

        # These importers return the final structure directly.
        if args.format == "srnabench":
            annotated[in_path] = srnabench.read_file(in_path, args)
        elif args.format == "isomirsea":
            annotated[in_path] = isomirsea.read_file(in_path, args)
        else:
            # The remaining formats yield raw hits that still have to be
            # annotated. An unrecognised format leaves `hits` unbound, so
            # the annotate() call below fails, exactly as before.
            if args.format == "BAM":
                hits = _read_bam(in_path, args)
            elif args.format == "seqbuster":
                hits = seqbuster.read_file(in_path, args)
            elif args.format == "prost":
                hits = prost.read_file(in_path, precursors, database,
                                       args.gtf)
            annotation = annotate(hits, matures, precursors)
            annotated[in_path] = body.create(annotation, database,
                                             sample_name, args)

        sample_header = header.create([sample_name], database, "")
        _write(annotated[in_path], sample_header, out_path)

    # merge all reads for all samples into one dict
    merged = merge.merge(annotated, samples)
    merged_path = op.join(args.out, "mirtop.%s" % args.out_format)
    merged_header = header.create(samples, database, "")
    _write(merged, merged_header, merged_path)
def test_collapse(self):
    """Annotate the collapsing-isomirs SAM example and build its GFF body.

    Loads the example precursors and matures, reads and annotates the SAM
    file, creates the GFF structure and prints it. Returns True when every
    step ran without raising; nothing is asserted about the content.
    """
    from mirtop.libs import logger
    from mirtop.mirna import mapper, fasta
    from mirtop.gff import body, header
    logger.initialize_logger("test", True, True)
    logger = logger.getLogger(__name__)
    precursors = fasta.read_precursor("data/examples/annotate/hairpin.fa",
                                      "hsa")
    # depend on https://github.com/miRTop/mirtop/issues/6
    matures = mapper.read_gtf_to_precursor(
        "data/examples/annotate/hsa.gff3")
    # matures = mirtop.mirna.read_mature("data/examples/annotate/mirnas.gff", "hsa")
    from mirtop.bam import bam
    bam_fn = "data/aligments/collapsing-isomirs.sam"
    reads = bam.read_bam(bam_fn, precursors)
    ann = bam.annotate(reads, matures, precursors)
    fn = bam_fn + ".gff"
    h = header.create(bam_fn, ["example"], "miRBase21")
    # NOTE(review): the `header` module is passed here while the header `h`
    # built just above is never used - presumably `h` was intended; confirm
    # against the body.create signature this test targets.
    gff = body.create(ann, "miRBase21", "example", fn, header)
    # print() with a single argument behaves the same on Python 2 and 3;
    # the former `print gff` statement is a SyntaxError on Python 3.
    print(gff)
    return True
def reader(args):
    """
    Realign BAM hits to miRBase to get better accuracy and annotation.

    Low-memory variant: instead of collecting results in a dictionary, each
    input file gets its own output file
    ``<args.out>/<sample>.<args.out_format>``. The header is printed first,
    then the open handle is passed to a format-specific helper.

    Only the "BAM" (optionally genomic) and "seqbuster" formats are handled.

    Args:
        args: options object; reads ``hairpin``, ``sps``, ``gtf``,
            ``keep_name``, ``files``, ``format``, ``genomic``, ``out`` and
            ``out_format``; sets ``database``, ``precursors`` and ``matures``.

    Raises:
        ValueError: if ``args.format`` is neither "BAM" nor "seqbuster".
            The output file has already been created and its header written
            at that point.
    """
    samples = []
    database = mapper.guess_database(args)
    args.database = database
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    args.precursors = precursors
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.matures = matures
    # TODO check numbers of miRNA and precursors read
    # TODO print message if numbers mismatch
    if args.keep_name and len(args.files) > 1:
        logger.warning("--keep-name when running multiple samples\n"
                       "can generate wrong results if the\n"
                       "name read is different across sample\n"
                       "for the same sequence.")
    for fn in args.files:
        fn = op.normpath(fn)
        # NOTE(review): for "gff" input `sample` and `fn_out` are never
        # bound, so the header creation below fails before the ValueError
        # branch is reached.
        if args.format != "gff":
            sample = op.splitext(op.basename(fn))[0]
            samples.append(sample)
            fn_out = op.join(args.out, sample + ".%s" % args.out_format)
        h = header.create([sample], args.database, "")
        # The context manager closes the handle even when a helper raises
        # or the format is unsupported; the former open()/close() pair
        # leaked it on any exception.
        with open(fn_out, 'w') as out_handle:
            print(h, file=out_handle)
            if args.format == "BAM":
                if args.genomic:
                    low_memory_genomic_bam(fn, sample, out_handle, args)
                else:
                    low_memory_bam(fn, sample, out_handle, args)
            elif args.format == "seqbuster":
                seqbuster.read_file_low_memory(fn, sample, args, out_handle)
            else:
                raise ValueError("%s not supported for low memory" %
                                 args.format)
def reader(args):
    """
    Realign BAM hits to miRBase to get better accuracy and annotation.

    Loads the database name, precursors and matures, then for every file in
    ``args.files`` reads the hits with the importer selected by
    ``args.format`` ("BAM", "seqbuster" or "srnabench"), annotates them and
    passes the result to ``body.create`` together with the output path
    ``<args.out>/<sample>.gff`` and the header.

    Args:
        args: options object; reads ``gtf``, ``hairpin``, ``sps``, ``files``,
            ``format`` and ``out``.
    """
    database = mapper.guess_database(args.gtf)
    # hairpin, mirna = download_mirbase(args)
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    matures = mapper.read_gtf_to_precursor(args.gtf)
    # check numbers of miRNA and precursors read
    # print message if numbers mismatch
    out_dts = dict()
    for fn in args.files:
        sample = op.splitext(op.basename(fn))[0]
        fn_out = op.join(args.out, sample + ".gff")
        # Any other format leaves `reads` unbound and fails at annotate().
        if args.format == "BAM":
            reads = _read_bam(fn, precursors)
        elif args.format == "seqbuster":
            reads = seqbuster.read_file(fn, precursors)
            # NOTE(review): `custom` is never used; the header below is
            # created with an empty custom string - confirm the intent.
            custom = seqbuster.header()
        elif args.format == "srnabench":
            # Was `read_gile`, a typo for the importer's `read_file`.
            reads = srnabench.read_file(fn, precursors)
        h = header.create([sample], database, "")
        ann = annotate(reads, matures, precursors)
        out_dts[fn] = body.create(ann, database, sample, fn_out, h)
default=False) parser.add_option("-p", "--prefix", help="output name") parser.add_option("--seed", help="set up seed for reproducibility.", default=None) (options, args) = parser.parse_args() if options.seed: random.seed(options.seed) full_fq = "%s_full.fq" % options.prefix clean_fq = "%s_clean.fq" % options.prefix out_gff = "%s.gff" % options.prefix if os.path.exists(full_fq): os.remove(full_fq) if os.path.exists(clean_fq): os.remove(clean_fq) pre = fasta.read_precursor(options.fa, "") mir = mapper.read_gtf_to_precursor(options.gtf) nt = ['A', 'T', 'G', 'C'] gffs = dict() h = header.create(["sampleX"], "miRBase1", "") for precursor in pre: seq = pre[precursor] gffs.update(create_iso(precursor, mir, seq, options.numsim, options.exp)) _write(gffs, h, out_gff)
help="give expression", default=False) parser.add_option("-p", "--prefix", help="output name") parser.add_option("--seed", help="set up seed for reproducibility.", default = None) (options, args) = parser.parse_args() if options.seed: random.seed(options.seed) full_fq = "%s_full.fq" % options.prefix clean_fq = "%s_clean.fq" % options.prefix out_gff = "%s.gff" % options.prefix if os.path.exists(full_fq): os.remove(full_fq) if os.path.exists(clean_fq): os.remove(clean_fq) pre = fasta.read_precursor(options.fa, "") mir = mapper.read_gtf_to_precursor(options.gtf) nt = ['A', 'T', 'G', 'C'] gffs = dict() h = header.create(["sampleX"], "miRBase1", "") for precursor in pre: seq = pre[precursor] gffs.update(create_iso(precursor, mir, seq, options.numsim, options.exp)) _write(gffs, h, out_gff)