def _download_mirbase(args, version="CURRENT"):
    """
    Download files from mirbase

    When both ``args.hairpin`` and ``args.mirna`` are set they are returned
    untouched and nothing is downloaded. Otherwise ``hairpin.fa.gz`` and
    ``miRNA.str.gz`` of the requested miRBase ``version`` are fetched with
    ``wget`` into ``args.out`` and decompressed with ``gunzip``. Both files
    are downloaded even if only one of the two arguments is missing, so the
    returned pair always lives in the same directory.

    :param args: object exposing ``hairpin``, ``mirna`` and ``out``
    :param version: miRBase release directory used in the FTP URL
    :returns: tuple ``(hairpin, mirna)`` with the paths of the files to use
    """
    if args.hairpin and args.mirna:
        return args.hairpin, args.mirna

    logger.info("Working with version %s" % version)
    out_dir = op.abspath(args.out)
    hairpin_fn = op.join(out_dir, "hairpin.fa")
    mirna_fn = op.join(out_dir, "miRNA.str")
    targets = ((hairpin_fn, "download hairpin"),
               (mirna_fn, "download mirna"))
    for final_fn, descr in targets:
        # gunzip removes the .gz archive, so the decompressed file is the
        # one whose presence tells us the download was already done.
        if file_exists(final_fn):
            continue
        gz_fn = final_fn + ".gz"
        # The archive path is spelled out twice instead of relying on the
        # bash history shortcut ``!$``: history expansion is not active in
        # non-interactive shells, so gunzip would receive a literal "!$".
        cmd = ("wget ftp://mirbase.org/pub/mirbase/%s/%s -O %s "
               "&& gunzip -f %s" % (version, op.basename(gz_fn), gz_fn, gz_fn))
        do.run(cmd, descr)
    return hairpin_fn, mirna_fn
def miraligner(args):
    """
    Realign BAM hits to miRBase to get better accuracy and annotation

    For every path in ``args.files`` the reads are loaded (from SAM/BAM
    directly, or from FASTA/FASTQ through miraligner), written as a
    ``<sample>.mirna`` table in ``args.out`` and, on a best-effort basis,
    exported as ``<sample>.vcf``. The per-sample tables that are pandas
    DataFrames are finally merged by ``_create_counts``.

    :param args: object exposing ``hairpin``, ``mirna``, ``sps``, ``gtf``,
        ``files``, ``out`` and ``miraligner``
    :raises ValueError: when a file extension is not recognized, or when a
        FASTA/FASTQ file is given without ``args.miraligner``
    """
    # Use the paths handed back by the download step: args.hairpin and
    # args.mirna are empty precisely when the files had to be downloaded.
    hairpin, mirna = _download_mirbase(args)
    precursors = _read_precursor(hairpin, args.sps)
    matures = _read_mature(mirna, args.sps)
    gtf = _read_gtf(args.gtf)
    out_dts = []
    for bam_fn in args.files:
        sample = op.splitext(op.basename(bam_fn))[0]
        if bam_fn.endswith(("bam", "sam")):
            logger.info("Reading %s" % bam_fn)
            bam_fn = _sam_to_bam(bam_fn)
            bam_sort_by_n = _bam_sort(bam_fn)
            reads = _read_bam(bam_sort_by_n, precursors)
        elif bam_fn.endswith(("fasta", "fa", "fastq")):
            # Fail before touching the input: without miraligner there is
            # nothing that can align raw sequences.
            if not args.miraligner:
                logger.error("No other aligner supported")
                raise ValueError("Specify --miraligner or give BAM files")
            out_file = op.join(args.out, sample + ".premirna")
            bam_fn = _filter_seqs(bam_fn)
            _cmd_miraligner(bam_fn, out_file, args.sps, hairpin)
            reads = _read_miraligner(out_file)
        else:
            raise ValueError("Format not recognized.")

        # NOTE(review): BAM input combined with --miraligner also skips
        # this annotation step -- confirm that is intended.
        if not args.miraligner:
            reads = _annotate(reads, matures, precursors)
        out_file = op.join(args.out, sample + ".mirna")
        out_file, dt, dt_pre = _tab_output(reads, out_file, sample)

        # VCF export is best effort: any failure is logged and the sample
        # still contributes to the count table.
        try:
            vcf_file = op.join(args.out, sample + ".vcf")
            if not file_exists(vcf_file):
                create_vcf(dt_pre, matures, gtf, vcf_file)
            # Opening the file with the optional ``vcf`` module; presumably
            # a sanity check that the output is parseable.
            import vcf
            vcf.Reader(filename=vcf_file)
        except Exception as e:
            logger.warning(e.__doc__)
            # str(e) instead of e.message: the attribute is deprecated on
            # Python 2 and does not exist on Python 3 exceptions.
            logger.warning(str(e))

        if isinstance(dt, pd.DataFrame):
            out_dts.append(dt)

    if out_dts:
        _create_counts(out_dts, args.out)
    else:
        # Call form prints the same text on Python 2 and Python 3.
        print("No files analyzed!")
def _cmd_miraligner(fn, out_file, species, hairpin):
    """
    Run miraligner for miRNA annotation

    The database directory passed to the tool is the folder that contains
    ``hairpin``. The command is only executed when ``out_file`` is not
    there yet; afterwards the ``<out_file>.mirna`` file is moved onto
    ``out_file``.

    :param fn: input file given to ``-i``
    :param out_file: output path given to ``-o`` and final result location
    :param species: value given to ``-s``
    :param hairpin: path whose parent directory is given to ``-db``
    :returns: ``out_file``
    """
    tool = _get_miraligner()
    path_db = op.dirname(op.abspath(hairpin))
    if file_exists(out_file):
        return out_file

    template = "{tool} -freq -i {fn} -o {out_file} -s {species} -db {path_db} -sub 1 -trim 3 -add 3"
    command = template.format(tool=tool, fn=fn, out_file=out_file,
                              species=species, path_db=path_db)
    do.run(command, "miraligner with %s" % fn)
    # The result is picked up from out_file + ".mirna" and moved onto
    # the requested name.
    shutil.move(out_file + ".mirna", out_file)
    return out_file
def _filter_seqs(fn):
    """Convert names of sequences to unique ids

    Reads a FASTA/FASTQ file and writes ``<fn without extension>_unique.fa``
    with two lines per kept record: the name produced by ``_make_unique``
    and the sequence. A record is kept when its sequence is shorter than 26
    characters and ``_get_freq`` reports a count greater than 1 or equal
    to 0. Nothing is done when the output file already exists.

    :param fn: path to the input file; header lines start with ``@`` or ``>``
    :returns: path to the filtered file
    """
    out_file = op.splitext(fn)[0] + "_unique.fa"
    if file_exists(out_file):
        return out_file

    idx = 0  # number of records written so far, fed to _make_unique
    with open(out_file, 'w') as out_handle:
        with open(fn) as in_handle:
            for line in in_handle:
                if not line.startswith(("@", ">")):
                    continue
                fixed_name = _make_unique(line.strip(), idx)
                seq_line = next(in_handle, None)
                if seq_line is None:
                    # Header without a sequence: the input is truncated,
                    # so there is nothing left to read.
                    break
                seq = seq_line.strip()
                counts = _get_freq(fixed_name)
                if len(seq) < 26 and (counts > 1 or counts == 0):
                    idx += 1
                    out_handle.write("%s\n" % fixed_name)
                    out_handle.write("%s\n" % seq)
                if line.startswith("@"):
                    # FASTQ record: skip the separator and quality lines so
                    # a quality string starting with "@" is never taken for
                    # a header. A missing tail at end of file is tolerated.
                    next(in_handle, None)
                    next(in_handle, None)
    return out_file
def _bam_sort(bam_fn):
    """Sort a BAM file with ``samtools sort -n``

    The result is written next to the input as
    ``<bam_fn without extension>_sort.bam``; samtools is only invoked when
    that file is not there yet.

    :param bam_fn: path to the BAM file to sort
    :returns: path to the sorted BAM file
    """
    prefix, _ext = op.splitext(bam_fn)
    sorted_fn = prefix + "_sort.bam"
    if file_exists(sorted_fn):
        return sorted_fn

    command = "samtools sort -n -o {bam_sort_by_n} {bam_fn}".format(
        bam_sort_by_n=sorted_fn, bam_fn=bam_fn)
    do.run(command)
    return sorted_fn