def _sam_to_bam(bam_fn): if bam_fn.endswith("bam"): bam_out = "%s.bam" % os.path.splitext(bam_fn)[0] cmd = "samtools view -Sbh {bam_fn} -o {bam_out}" do.run(cmd) return bam_out return bam_fn
def _create_clusters(seqL, bam_file, args):
    """
    Build the cluster objects from the aligned reads, or reload them
    from the pickle cached in ``args.out`` by a previous run.

    Returns the cluster object produced by ``detect_clusters`` (or the
    unpickled one).
    """
    cluster_file = op.join(args.out, "cluster.bed")
    pickle_path = op.join(args.out, 'list_obj.pk')

    if os.path.exists(pickle_path):
        logger.info("Loading previous clusters")
        with open(pickle_path, 'rb') as in_handle:
            clus_obj = pickle.load(in_handle)
    else:
        if not file_exists(cluster_file):
            logger.info("Parsing aligned file")
            logger.info("Merging sequences")
            # Prefer a bedtools binary sitting next to the running
            # interpreter; otherwise rely on PATH.
            sibling_bedtools = os.path.join(os.path.dirname(sys.executable), "bedtools")
            if os.path.exists(sibling_bedtools):
                bedtools_bin = sibling_bedtools
            else:
                bedtools_bin = "bedtools"
            # awk rewrites column 5 with a running line counter.
            awk_renumber = "awk '{i=i+1;print $1\"\\t\"$2\"\\t\"$3\"\\t\"$4\"\\t\"i\"\\t\"$6}'"
            pipeline = "{bedtools} bamtobed -i {bam_file} | {parse_cmd} | {bedtools} cluster -s -d 20 -i - > {cluster_file}"
            do.run(pipeline.format(bedtools=bedtools_bin,
                                   bam_file=bam_file,
                                   parse_cmd=awk_renumber,
                                   cluster_file=cluster_file))
        clustered_bed = pybedtools.BedTool(cluster_file)
        logger.info("Creating clusters")
        clus_obj = detect_clusters(clustered_bed, seqL, args.min_seqs, args.non_un_gl)
        # Cache the result so the next run can skip the clustering.
        with open(pickle_path, 'wb') as out_handle:
            pickle.dump(clus_obj, out_handle, pickle.HIGHEST_PROTOCOL)

    logger.info("%s clusters found" % (len(clus_obj.clusid)))
    return clus_obj
def _create_clusters(seqL, bam_file, args):
    """
    Build the cluster objects from the aligned reads, or reload them
    from the pickle cached in ``args.out`` by a previous run.

    Returns the cluster object produced by ``detect_clusters`` (or the
    unpickled one).
    """
    cluster_file = op.join(args.out, "cluster.bed")
    pickle_path = op.join(args.out, 'list_obj.pk')

    if os.path.exists(pickle_path):
        logger.info("Loading previous clusters")
        with open(pickle_path, 'rb') as in_handle:
            clus_obj = pickle.load(in_handle)
    else:
        if not file_exists(cluster_file):
            logger.info("Parsing aligned file")
            logger.info("Merging sequences")
            # Prefer a bedtools binary sitting next to the running
            # interpreter; otherwise rely on PATH.
            sibling_bedtools = os.path.join(os.path.dirname(sys.executable), "bedtools")
            if os.path.exists(sibling_bedtools):
                bedtools_bin = sibling_bedtools
            else:
                bedtools_bin = "bedtools"
            # awk rewrites column 5 with a running line counter.
            awk_renumber = "awk '{i=i+1;print $1\"\\t\"$2\"\\t\"$3\"\\t\"$4\"\\t\"i\"\\t\"$6}'"
            pipeline = "{bedtools} bamtobed -i {bam_file} | {parse_cmd} | {bedtools} cluster -s -d 20 -i - > {cluster_file}"
            do.run(pipeline.format(bedtools=bedtools_bin,
                                   bam_file=bam_file,
                                   parse_cmd=awk_renumber,
                                   cluster_file=cluster_file))
        clustered_bed = pybedtools.BedTool(cluster_file)
        logger.info("Creating clusters")
        clus_obj = detect_clusters(clustered_bed, seqL, args.min_seqs, args.non_un_gl)
        # Cache the result so the next run can skip the clustering.
        with open(pickle_path, 'wb') as out_handle:
            pickle.dump(clus_obj, out_handle, pickle.HIGHEST_PROTOCOL)

    logger.info("%s clusters found" % (len(clus_obj.clusid)))
    return clus_obj
def clean_bam_file(bam_in, mask=None):
    """
    Filter a BAM file and collect the sequences that survive.

    When ``mask`` is given, alignments overlapping it are removed first
    (``intersect`` with ``v=True``) and the result is written to
    ``<prefix>_mask.bam``. The (possibly masked) BAM is indexed with
    samtools and every fetched read is copied to ``<prefix>_rmlw.bam``
    unless:

    * its first CIGAR block with operation code 0 is shorter than 17, or
    * its first CIGAR block with operation code 4 is longer than 3.

    :param bam_in: path to the input BAM file.
    :param mask: optional intervals passed as ``b`` to the intersect.
    :returns: tuple ``(out_file, seq_obj)`` where ``seq_obj`` maps the
        integer parsed from the read name (``seq_<n>``) to a ``sequence``
        object whose ``align`` attribute is the read's NH tag (1 when the
        tag is missing).
    :raises IOError: if the ``.bai`` index is missing after indexing.
    """
    seq_obj = defaultdict(int)
    if mask:
        mask_file = op.splitext(bam_in)[0] + "_mask.bam"
        if not file_exists(mask_file):
            # The input of the intersect is this function's own BAM path.
            pybedtools.BedTool(bam_in).intersect(b=mask, v=True).saveas(mask_file)
        bam_in = mask_file
    out_file = op.splitext(bam_in)[0] + "_rmlw.bam"
    # fetch() below needs the index.
    run("samtools index %s" % bam_in)
    if not file_exists(bam_in + ".bai"):
        raise IOError("Failed to created bam index of %s. Try to do it manually" % bam_in)
    # Both handles are context-managed so the input file is closed too.
    with pysam.AlignmentFile(bam_in, "rb") as bam_handle:
        with pysam.AlignmentFile(out_file, "wb", template=bam_handle) as out_handle:
            for read in bam_handle.fetch():
                seq_name = int(read.query_name.replace('seq_', ''))
                match_size = [nts for oper, nts in read.cigartuples if oper == 0]
                subs_size = [nts for oper, nts in read.cigartuples if oper == 4]
                if match_size[0] < 17:
                    continue
                if subs_size and subs_size[0] > 3:
                    continue
                try:
                    nh = read.get_tag('NH')
                except KeyError:
                    # No NH tag: record a single hit.
                    nh = 1
                seq_obj[seq_name] = sequence(seq_name)
                seq_obj[seq_name].align = nh
                out_handle.write(read)
    return out_file, seq_obj
def _sam_to_bam(bam_fn): if bam_fn.endswith("bam"): bam_out = "%s.bam" % os.path.splitext(bam_fn)[0] cmd = "samtools view -Sbh {bam_fn} -o {bam_out}" do.run(cmd) return bam_out return bam_fn
def _run_tRNA_scan(fasta_file):
    """
    Call tRNAscan-SE on ``fasta_file``.

    Returns the two output paths handed to the tool: the one passed to
    ``-o`` (``<fasta_file>_trnascan``) and the one passed to ``-f``
    (``<fasta_file>_second_str``).
    """
    results_path = fasta_file + "_trnascan"
    structure_path = fasta_file + "_second_str"
    command = "tRNAscan-SE -q -o {out_file} -f {se_file} {fasta_file}".format(
        out_file=results_path,
        se_file=structure_path,
        fasta_file=fasta_file,
    )
    run(command)
    return results_path, structure_path
def _run_tRNA_scan(fasta_file):
    """
    Call tRNAscan-SE on ``fasta_file``.

    Returns the two output paths handed to the tool: the one passed to
    ``-o`` (``<fasta_file>_trnascan``) and the one passed to ``-f``
    (``<fasta_file>_second_str``).
    """
    results_path = fasta_file + "_trnascan"
    structure_path = fasta_file + "_second_str"
    command = "tRNAscan-SE -q -o {out_file} -f {se_file} {fasta_file}".format(
        out_file=results_path,
        se_file=structure_path,
        fasta_file=fasta_file,
    )
    run(command)
    return results_path, structure_path
def _cmd_miraligner(fn, out_file, species, hairpin, out):
    """
    Annotate ``fn`` with miraligner unless ``out_file`` already exists.

    The database directory given to miraligner is the directory that
    contains ``hairpin``. The tool's ``<out_file>.mirna`` result is moved
    to ``out_file``, which is returned. ``out`` is accepted but not used.
    """
    launcher = _get_miraligner()
    db_dir = op.dirname(op.abspath(hairpin))
    if file_exists(out_file):
        # Result from a previous run: reuse it.
        return out_file

    logger.info("Running miraligner with %s" % fn)
    command = "{tool} -freq -i {fn} -o {out_file} -s {species} -db {path_db} -sub 1 -trim 3 -add 3".format(
        tool=launcher,
        fn=fn,
        out_file=out_file,
        species=species,
        path_db=db_dir,
    )
    do.run(command, "miraligner with %s" % fn)
    shutil.move(out_file + ".mirna", out_file)
    return out_file
def _cmd_miraligner(fn, out_file, species, hairpin, out):
    """
    Annotate ``fn`` with miraligner unless ``out_file`` already exists.

    The database directory given to miraligner is the directory that
    contains ``hairpin``. The tool's ``<out_file>.mirna`` result is moved
    to ``out_file``, which is returned. ``out`` is accepted but not used.
    """
    launcher = _get_miraligner()
    db_dir = op.dirname(op.abspath(hairpin))
    if file_exists(out_file):
        # Result from a previous run: reuse it.
        return out_file

    logger.info("Running miraligner with %s" % fn)
    command = "{tool} -freq -i {fn} -o {out_file} -s {species} -db {path_db} -sub 1 -trim 3 -add 3".format(
        tool=launcher,
        fn=fn,
        out_file=out_file,
        species=species,
        path_db=db_dir,
    )
    do.run(command, "miraligner with %s" % fn)
    shutil.move(out_file + ".mirna", out_file)
    return out_file
def detect_regions(bam_in, bed_file, out_dir, prefix):
    """
    Count reads per locus with coverageBed and return the fixed-up
    coverage file.

    The BED file is first passed through ``_reorder_columns``; the sorted
    coverage is written to ``<out_dir>/loci.cov`` and then handed to
    ``_fix_score_column``, whose result is returned. ``prefix`` is
    accepted but not used.
    """
    reordered_bed = _reorder_columns(bed_file)
    coverage_template = ("coverageBed -s -counts -b {bam_in} -a {bed_file}"
                         " | sort -k4,4 > {out_dir}/loci.cov")
    with utils.chdir(out_dir):
        # The template does not reference the two read-length values;
        # they are still passed, so the module-level names must exist.
        command = coverage_template.format(
            bam_in=bam_in,
            bed_file=reordered_bed,
            out_dir=out_dir,
            prefix=prefix,
            min_trimmed_read_len=min_trimmed_read_len,
            max_trimmed_read_len=max_trimmed_read_len,
        )
        run(command, "Run counts_reads")
    return _fix_score_column(op.join(out_dir, "loci.cov"))
def _download_mirbase(args, version="CURRENT"): """ Download files from mirbase """ if not args.hairpin or not args.mirna: logger.info("Working with version %s" % version) hairpin_fn = op.join(op.abspath(args.out), "hairpin.fa.gz") mirna_fn = op.join(op.abspath(args.out), "miRNA.str.gz") if not file_exists(hairpin_fn): cmd_h = "wget ftp://mirbase.org/pub/mirbase/%s/hairpin.fa.gz -O %s && gunzip -f !$" % (version, hairpin_fn) do.run(cmd_h, "download hairpin") if not file_exists(mirna_fn): cmd_m = "wget ftp://mirbase.org/pub/mirbase/%s/miRNA.str.gz -O %s && gunzip -f !$" % (version, mirna_fn) do.run(cmd_m, "download mirna") else: return args.hairpin, args.mirna
def deprecated_map_to_precursors(seqs, names, loci, out_file, args):
    """Align the sequences against the precursor loci with bowtie2.

    Everything is staged in a temporary directory. When
    ``bowtie2-build`` is available the alignment is parsed by
    ``read_alignment`` (whose result becomes the return value) and the
    precursor FASTA is copied next to ``out_file`` with "tsv" replaced
    by "fa". Otherwise ``out_file`` is returned untouched.
    """
    with make_temp_directory() as workdir:
        sam_path = os.path.join(workdir, "out.sam")
        precursor_fa = get_loci_fasta(loci, os.path.join(workdir, "pre.fa"), args.ref)
        precursor_copy = out_file.replace("tsv", "fa")
        reads_fa = get_seqs_fasta(seqs, names, os.path.join(workdir, "seqs.fa"))
        if find_cmd("bowtie2-build"):
            build_cmd = "bowtie2-build -f {pre_fasta} {temp}/pre"
            run(build_cmd.format(pre_fasta=precursor_fa, temp=workdir))
            align_cmd = "bowtie2 -a --rdg 7,3 --mp 4 --end-to-end -D 20 -R 3 -N 0 -i S,1,0.8 -L 3 -f -x {temp}/pre -U {seqs_fasta} -S {out_sam}"
            run(align_cmd.format(temp=workdir, seqs_fasta=reads_fa, out_sam=sam_path))
            out_file = read_alignment(sam_path, loci, seqs, out_file)
            # Keep the precursors: the temporary directory goes away.
            shutil.copy(precursor_fa, precursor_copy)
    return out_file
def map_to_precursors(seqs, names, loci, out_file, args):
    """Align the sequences against the precursor loci with razers3.

    Everything is staged in a temporary directory. When ``razers3`` is
    available the alignment is parsed by ``read_alignment`` (whose result
    becomes the return value) and the precursor FASTA is copied next to
    ``out_file`` with "tsv" replaced by "fa". Otherwise ``out_file`` is
    returned untouched.
    """
    with make_temp_directory() as workdir:
        sam_path = os.path.join(workdir, "out.sam")
        precursor_fa = get_loci_fasta(loci, os.path.join(workdir, "pre.fa"), args.ref)
        precursor_copy = out_file.replace("tsv", "fa")
        reads_fa = get_seqs_fasta(seqs, names, os.path.join(workdir, "seqs.fa"))
        if find_cmd("razers3"):
            align_cmd = "razers3 -dr 2 -i 80 -rr 90 -f -o {out_sam} {temp}/pre.fa {seqs_fasta}"
            run(align_cmd.format(out_sam=sam_path, temp=workdir, seqs_fasta=reads_fa))
            out_file = read_alignment(sam_path, loci, seqs, out_file)
            # Keep the precursors: the temporary directory goes away.
            shutil.copy(precursor_fa, precursor_copy)
    return out_file
def deprecated_map_to_precursors(seqs, names, loci, out_file, args):
    """Align the sequences against the precursor loci with bowtie2.

    Everything is staged in a temporary directory. When
    ``bowtie2-build`` is available the alignment is parsed by
    ``read_alignment`` (whose result becomes the return value) and the
    precursor FASTA is copied next to ``out_file`` with "tsv" replaced
    by "fa". Otherwise ``out_file`` is returned untouched.
    """
    with make_temp_directory() as workdir:
        sam_path = os.path.join(workdir, "out.sam")
        precursor_fa = get_loci_fasta(loci, os.path.join(workdir, "pre.fa"), args.ref)
        precursor_copy = out_file.replace("tsv", "fa")
        reads_fa = get_seqs_fasta(seqs, names, os.path.join(workdir, "seqs.fa"))
        if find_cmd("bowtie2-build"):
            build_cmd = "bowtie2-build -f {pre_fasta} {temp}/pre"
            run(build_cmd.format(pre_fasta=precursor_fa, temp=workdir))
            align_cmd = "bowtie2 -a --rdg 7,3 --mp 4 --end-to-end -D 20 -R 3 -N 0 -i S,1,0.8 -L 3 -f -x {temp}/pre -U {seqs_fasta} -S {out_sam}"
            run(align_cmd.format(temp=workdir, seqs_fasta=reads_fa, out_sam=sam_path))
            out_file = read_alignment(sam_path, loci, seqs, out_file)
            # Keep the precursors: the temporary directory goes away.
            shutil.copy(precursor_fa, precursor_copy)
    return out_file
def map_to_precursors(seqs, names, loci, out_file, args):
    """Align the sequences against the precursor loci with razers3.

    Everything is staged in a temporary directory. When ``razers3`` is
    available the alignment is parsed by ``read_alignment`` (whose result
    becomes the return value) and the precursor FASTA is copied next to
    ``out_file`` with "tsv" replaced by "fa". Otherwise ``out_file`` is
    returned untouched.
    """
    with make_temp_directory() as workdir:
        sam_path = os.path.join(workdir, "out.sam")
        precursor_fa = get_loci_fasta(loci, os.path.join(workdir, "pre.fa"), args.ref)
        precursor_copy = out_file.replace("tsv", "fa")
        reads_fa = get_seqs_fasta(seqs, names, os.path.join(workdir, "seqs.fa"))
        if find_cmd("razers3"):
            align_cmd = "razers3 -dr 2 -i 80 -rr 90 -f -o {out_sam} {temp}/pre.fa {seqs_fasta}"
            run(align_cmd.format(out_sam=sam_path, temp=workdir, seqs_fasta=reads_fa))
            out_file = read_alignment(sam_path, loci, seqs, out_file)
            # Keep the precursors: the temporary directory goes away.
            shutil.copy(precursor_fa, precursor_copy)
    return out_file
def _download_mirbase(args, version="CURRENT"): """ Download files from mirbase """ if not args.hairpin or not args.mirna: logger.info("Working with version %s" % version) hairpin_fn = op.join(op.abspath(args.out), "hairpin.fa.gz") mirna_fn = op.join(op.abspath(args.out), "miRNA.str.gz") if not file_exists(hairpin_fn): cmd_h = "wget ftp://mirbase.org/pub/mirbase/%s/hairpin.fa.gz -O %s && gunzip -f !$" % ( version, hairpin_fn) do.run(cmd_h, "download hairpin") if not file_exists(mirna_fn): cmd_m = "wget ftp://mirbase.org/pub/mirbase/%s/miRNA.str.gz -O %s && gunzip -f !$" % ( version, mirna_fn) do.run(cmd_m, "download mirna") else: return args.hairpin, args.mirna
def _get_miraligner():
    """
    Return the shell command prefix used to launch miraligner.

    If running the bare ``miraligner`` command exits with status 0, the
    prefix is ``miraligner <jvm opts>``. Otherwise ``miraligner.jar`` in
    the current working directory is used (downloaded with wget when it
    is not there) and the prefix is
    ``java -jar <jvm opts> <abs path to jar>``.

    :returns: command prefix as a string.
    """
    opts = "-Xms750m -Xmx4g"
    tool = "miraligner"
    # A zero exit status from the bare command is taken as "installed".
    if os.system(tool) == 0:
        return "%s %s" % (tool, opts)

    jar_path = op.abspath("miraligner.jar")
    if not utils.file_exists(jar_path):
        url = "https://raw.githubusercontent.com/lpantano/seqbuster/miraligner/modules/miraligner/miraligner.jar"
        cmd = ["wget", "-O miraligner.jar", "--no-check-certificate", url]
        do.run(" ".join(cmd), "Download miraligner.")
    # Interpolate with % rather than str.format(): the absolute path may
    # itself contain "{" or "}", which format() would try to expand.
    return "java -jar %s %s" % (opts, jar_path)
def get_fasta(bed_file, ref, out_fa):
    """Write the sequences of the ``bed_file`` intervals to ``out_fa``.

    Runs ``bedtools getfasta -s`` against the reference ``ref``.
    Nothing is returned.
    """
    template = "bedtools getfasta -s -fi {ref} -bed {bed_file} -fo {out_fa}"
    command = template.format(ref=ref, bed_file=bed_file, out_fa=out_fa)
    run(command)
def get_fasta(bed_file, ref, out_fa):
    """Write the sequences of the ``bed_file`` intervals to ``out_fa``.

    Runs ``bedtools getfasta -s`` against the reference ``ref``.
    Nothing is returned.
    """
    template = "bedtools getfasta -s -fi {ref} -bed {bed_file} -fo {out_fa}"
    command = template.format(ref=ref, bed_file=bed_file, out_fa=out_fa)
    run(command)
def create_features(bam_in, loci_file, reference, out_dir):
    """
    Run the CoRaL feature-extraction commands for ``loci_file``.

    All output files are placed under ``out_dir``. The steps run in this
    order: genomic length vectors, their plus/minus indexes, per-locus
    length vectors, antisense coverage, antisense column ordering, reads
    per position, positional entropy and structure calculation. Nothing
    is returned.
    """
    # Output locations.
    plus_vec = op.join(out_dir, 'genomic_lenvec.plus')
    minus_vec = op.join(out_dir, 'genomic_lenvec.minus')
    lenvec_prefix = op.join(out_dir, 'genomic_lenvec')
    locus_len_out = op.join(out_dir, 'feat_lengths.txt')
    antisense_cov = op.join(out_dir, 'loci.cov_anti')
    entropy_out = op.join(out_dir, 'feat_posentropy.txt')

    # Command templates.
    genomic_tpl = "compute_genomic_lenvectors {bam_in} {lenvec_plus} {lenvec_minus} {min_len} {max_len} "
    index_tpl = "index_genomic_lenvectors {lenvec} "
    locus_tpl = ("compute_locus_lenvectors {loci_file} {genomic_lenvec} "
                 "{min_len} {max_len} > {feat_len_file}")
    antisense_tpl = "coverageBed -S -counts -b {bam_in} -a {loci_file} > {cov_S_file}"
    entropy_tpl = "compute_locus_entropy.rb {counts_reads} > {feat_posentropy}"

    with utils.chdir(out_dir):
        run(genomic_tpl.format(bam_in=bam_in,
                               lenvec_plus=plus_vec,
                               lenvec_minus=minus_vec,
                               min_len=min_trimmed_read_len,
                               max_len=max_trimmed_read_len),
            "Run compute_genomic")
        for strand_vec, label in ((plus_vec, "Run index in plus"),
                                  (minus_vec, "Run index in minus")):
            run(index_tpl.format(lenvec=strand_vec), label)
        run(locus_tpl.format(loci_file=loci_file,
                             genomic_lenvec=lenvec_prefix,
                             min_len=min_trimmed_read_len,
                             max_len=max_trimmed_read_len,
                             feat_len_file=locus_len_out),
            "Run compute locus")
        run(antisense_tpl.format(bam_in=bam_in,
                                 loci_file=loci_file,
                                 cov_S_file=antisense_cov),
            "Run coverage antisense")
        # The results of this call and of calculate_structure are not
        # used further in this function.
        _order_antisense_column(antisense_cov, min_trimmed_read_len)
        per_position = _reads_per_position(bam_in, loci_file, out_dir)
        run(entropy_tpl.format(counts_reads=per_position,
                               feat_posentropy=entropy_out),
            "Run entropy")
        calculate_structure(loci_file, reference)