def check_clip_utils(logger, required_utils=["cutadapt", "fastx_collapser"]):
    """
    Check that necessary utilities are available.

    Takes:

    - logger: a logger to log messages to
    - required_utils: names of the programs to look up with
      utils.which(). The default list is only iterated over,
      never modified.

    Returns None when every program is found. Exits the process
    with status 1 at the first program that cannot be found.
    """
    logger.info("Checking that utilities required for CLIP are available..")
    for program in required_utils:
        if utils.which(program) is None:
            logger.critical("Could not access: %s" % (program))
            logger.critical("Make %s available and try again." % (program))
            sys.exit(1)
    logger.info("Found CLIP utilities.")
def check_clip_utils(logger, required_utils=["cutadapt", "fastx_collapser"]):
    """
    Check that necessary utilities are available.

    Takes:

    - logger: a logger to log messages to
    - required_utils: names of the programs to look up with
      utils.which(). The default list is only iterated over,
      never modified.

    Returns None when every program is found. Exits the process
    with status 1 at the first program that cannot be found.
    """
    logger.info("Checking that utilities required for CLIP are available..")
    for program in required_utils:
        if utils.which(program) is None:
            logger.critical("Could not access: %s" % (program))
            logger.critical("Make %s available and try again." % (program))
            sys.exit(1)
    logger.info("Found CLIP utilities.")
def trim_clip_adaptors(fastq_filename, adaptors_filename, output_dir, logger,
                       min_read_len=5):
    """
    Trim CLIP adaptors using 'cutadapt'.

    Takes:

    - fastq_filename: FASTQ file to trim
    - adaptors_filename: text file whose contents are placed on the
      cutadapt command line (newlines are replaced by spaces)
    - output_dir: directory in which the trimmed file is written
    - logger: a logger to log messages to
    - min_read_len: value passed to cutadapt's -m option

    Returns the trimmed filename (also when it already exists, in
    which case cutadapt is not run). Returns None if cutadapt exits
    with a non-zero status. Exits the process with status 1 if
    'cutadapt' or the adaptors file cannot be found.
    """
    logger.info("Trimming CLIP adaptors from: %s" % (fastq_filename))
    cutadapt_path = utils.which("cutadapt")
    if cutadapt_path is None:
        logger.critical("Could not find 'cutadapt' on the path. "
                        "Please install 'cutadapt' or make the installed "
                        "version available on path.")
        # Without cutadapt the command below would start with "None"
        sys.exit(1)
    output_basename = utils.trim_fastq_ext(os.path.basename(fastq_filename))
    output_filename = os.path.join(output_dir,
                                   "%s_trimmed.fastq.gz" % (output_basename))
    if os.path.isfile(output_filename):
        logger.info("SKIPPING: %s already exists!" % (output_filename))
        return output_filename
    logger.info(" - Outputting trimmed sequences to: %s" % (output_filename))
    # Load adaptors to pass to 'cutadapt'
    if not os.path.isfile(adaptors_filename):
        logger.critical("Could not find adaptors file %s"
                        % (adaptors_filename))
        sys.exit(1)
    with open(adaptors_filename, "r") as adaptors_in:
        # Substitute newlines with spaces
        adaptors = adaptors_in.read().strip().replace("\n", " ")
    cutadapt_cmd = "%s %s %s -o %s -m %d -q 3 > %s.log" % (cutadapt_path,
                                                         adaptors,
                                                         fastq_filename,
                                                         output_filename,
                                                         min_read_len,
                                                         output_filename)
    logger.info("Executing: %s" % (cutadapt_cmd))
    t1 = time.time()
    ret_val = os.system(cutadapt_cmd)
    t2 = time.time()
    logger.info("Trimming took %.2f mins." % ((t2 - t1) / 60.))
    if ret_val != 0:
        logger.critical("cutadapt command failed.")
        # The output did not exist before this call, so anything there
        # now is a partial result; remove it so that a later call does
        # not skip trimming because the file "already exists".
        if os.path.isfile(output_filename):
            os.remove(output_filename)
        return None
    return output_filename
def multi_tagBam(bam_filename, intervals_files, intervals_labels,
                 output_filename, logger):
    """
    Call tagBam mapping BAM file to multiple bed/gff files.

    Takes:

    - bam_filename: The BAM file path
    - intervals_files: List of string paths for the
      interval files (BED or GFF file format)
    - intervals_labels: Labels for each of the files in 'intervals_files'
    - output_filename: BAM filename to use as output
    - logger: a logger to log messages to

    Returns 'output_filename' on success, or when it already exists
    (tagBam is not rerun in that case). Returns None if the number of
    labels differs from the number of files, if tagBam is not found,
    or if the tagBam command exits with a non-zero status.
    """
    num_interval_files = len(intervals_files)
    logger.info("Running tagBam against %d interval files.."
                % (num_interval_files))
    num_labels = len(intervals_labels)
    if num_labels != num_interval_files:
        # Labels are paired with files by position on the command line
        logger.critical("Got %d labels for %d interval files."
                        % (num_labels, num_interval_files))
        return None
    tagBam = utils.which("tagBam")
    if tagBam is None:
        logger.critical("tagBam not found.")
        return None
    if os.path.isfile(output_filename):
        logger.info("Found %s, skipping." % (output_filename))
        return output_filename
    t1 = time.time()
    args = {"tagBam": tagBam,
            "bam_filename": bam_filename,
            "intervals_files": " ".join(intervals_files),
            "intervals_labels": " ".join(intervals_labels),
            "output_filename": output_filename}
    tagBam_cmd = \
        "%(tagBam)s -i %(bam_filename)s -files %(intervals_files)s " \
        "-labels %(intervals_labels)s -intervals -f 1 > %(output_filename)s" \
        % (args)
    logger.info("Executing: %s" % (tagBam_cmd))
    ret_val = os.system(tagBam_cmd)
    t2 = time.time()
    logger.info("tagBam took %.2f minutes." % ((t2 - t1) / 60.))
    if ret_val != 0:
        logger.critical("tagBam command failed.")
        # The shell redirect creates the output file even when tagBam
        # fails; remove it so that the next call does not find it and
        # skip the run.
        if os.path.isfile(output_filename):
            os.remove(output_filename)
        return None
    return output_filename
def trim_clip_adaptors(fastq_filename, adaptors_filename, output_dir, logger,
                       min_read_len=5):
    """
    Trim CLIP adaptors using 'cutadapt'.

    Takes:

    - fastq_filename: FASTQ file to trim
    - adaptors_filename: text file whose contents are placed on the
      cutadapt command line (newlines are replaced by spaces)
    - output_dir: directory in which the trimmed file is written
    - logger: a logger to log messages to
    - min_read_len: value passed to cutadapt's -m option

    Returns the trimmed filename (also when it already exists, in
    which case cutadapt is not run). Returns None if cutadapt exits
    with a non-zero status. Exits the process with status 1 if
    'cutadapt' or the adaptors file cannot be found.
    """
    logger.info("Trimming CLIP adaptors from: %s" % (fastq_filename))
    cutadapt_path = utils.which("cutadapt")
    if cutadapt_path is None:
        logger.critical("Could not find 'cutadapt' on the path. "
                        "Please install 'cutadapt' or make the installed "
                        "version available on path.")
        # Without cutadapt the command below would start with "None"
        sys.exit(1)
    output_basename = utils.trim_fastq_ext(os.path.basename(fastq_filename))
    output_filename = os.path.join(output_dir,
                                   "%s_trimmed.fastq.gz" % (output_basename))
    if os.path.isfile(output_filename):
        logger.info("SKIPPING: %s already exists!" % (output_filename))
        return output_filename
    logger.info(" - Outputting trimmed sequences to: %s" % (output_filename))
    # Load adaptors to pass to 'cutadapt'
    if not os.path.isfile(adaptors_filename):
        logger.critical("Could not find adaptors file %s"
                        % (adaptors_filename))
        sys.exit(1)
    with open(adaptors_filename, "r") as adaptors_in:
        # Substitute newlines with spaces
        adaptors = adaptors_in.read().strip().replace("\n", " ")
    cutadapt_cmd = "%s %s %s -o %s -m %d -q 3 > %s.log" % (
        cutadapt_path, adaptors, fastq_filename, output_filename,
        min_read_len, output_filename)
    logger.info("Executing: %s" % (cutadapt_cmd))
    t1 = time.time()
    ret_val = os.system(cutadapt_cmd)
    t2 = time.time()
    logger.info("Trimming took %.2f mins." % ((t2 - t1) / 60.))
    if ret_val != 0:
        logger.critical("cutadapt command failed.")
        # The output did not exist before this call, so anything there
        # now is a partial result; remove it so that a later call does
        # not skip trimming because the file "already exists".
        if os.path.isfile(output_filename):
            os.remove(output_filename)
        return None
    return output_filename
def multi_tagBam(bam_filename, intervals_files, intervals_labels,
                 output_filename, logger):
    """
    Call tagBam mapping BAM file to multiple bed/gff files.

    Takes:

    - bam_filename: The BAM file path
    - intervals_files: List of string paths for the
      interval files (BED or GFF file format)
    - intervals_labels: Labels for each of the files in 'intervals_files'
    - output_filename: BAM filename to use as output
    - logger: a logger to log messages to

    Returns 'output_filename' on success, or when it already exists
    (tagBam is not rerun in that case). Returns None if the number of
    labels differs from the number of files, if tagBam is not found,
    or if the tagBam command exits with a non-zero status.
    """
    num_interval_files = len(intervals_files)
    logger.info("Running tagBam against %d interval files.."
                % (num_interval_files))
    num_labels = len(intervals_labels)
    if num_labels != num_interval_files:
        # Labels are paired with files by position on the command line
        logger.critical("Got %d labels for %d interval files."
                        % (num_labels, num_interval_files))
        return None
    tagBam = utils.which("tagBam")
    if tagBam is None:
        logger.critical("tagBam not found.")
        return None
    if os.path.isfile(output_filename):
        logger.info("Found %s, skipping." % (output_filename))
        return output_filename
    t1 = time.time()
    args = {
        "tagBam": tagBam,
        "bam_filename": bam_filename,
        "intervals_files": " ".join(intervals_files),
        "intervals_labels": " ".join(intervals_labels),
        "output_filename": output_filename,
    }
    tagBam_cmd = (
        "%(tagBam)s -i %(bam_filename)s -files %(intervals_files)s "
        "-labels %(intervals_labels)s -intervals -f 1 > %(output_filename)s"
        % (args)
    )
    logger.info("Executing: %s" % (tagBam_cmd))
    ret_val = os.system(tagBam_cmd)
    t2 = time.time()
    logger.info("tagBam took %.2f minutes." % ((t2 - t1) / 60.0))
    if ret_val != 0:
        logger.critical("tagBam command failed.")
        # The shell redirect creates the output file even when tagBam
        # fails; remove it so that the next call does not find it and
        # skip the run.
        if os.path.isfile(output_filename):
            os.remove(output_filename)
        return None
    return output_filename
def check_requirements():
    """
    Check that all programs required by the pipeline are on the path.

    Every program in the list is checked (a missing one does not stop
    the scan), and a warning is printed for each program that
    utils.which() cannot find. For 'genePredToGtf' and 'gtf2gff3.pl'
    an extra message is produced by genePredToGtf_msg() /
    gtf2gff3_msg().

    Returns None if all programs are found; exits the process with
    status 1 if any is missing.
    """
    # Single-argument print(...) behaves the same under the Python 2
    # print statement and the Python 3 print function.
    print("Checking that all required programs are available...")
    # Utilities that need to be on path for pipeline to run
    REQUIRED_PROGRAMS = [
        # UCSC utils
        "genePredToGtf",
        # Tophat/Bowtie
        "bowtie-build",
        "tophat",
        # Bedtools
        "intersectBed",
        "subtractBed",
        "sortBed",
        "mergeBed",
        "tagBam",
        # Related utils
        "gtf2gff3.pl",
        # Unix utils
        "gunzip",
        "wget",
        "cat",
        "zcat",
        "cut"
    ]
    found_all = True
    for program in REQUIRED_PROGRAMS:
        if utils.which(program) is not None:
            continue
        print("WARNING: Cannot find required program '%s' "
              "on your path. Please install it or add it "
              "to your path if already installed." % (program))
        if program == "genePredToGtf":
            genePredToGtf_msg()
        elif program == "gtf2gff3.pl":
            gtf2gff3_msg()
        # Printed while the remaining programs are still being checked;
        # the process nevertheless exits below once the scan is done.
        print(" - Proceeding anyway...")
        found_all = False
    if not found_all:
        # Do not proceed if programs not found.
        sys.exit(1)
    print("Found all required programs.")
def run_meme(logger, input_fasta_fname, output_dir, meme_params=None):
    """
    Run MEME against an input FASTA file.

    Takes:

    - logger: a logger to log messages to
    - input_fasta_fname: FASTA file passed to MEME
    - output_dir: directory passed to MEME as '-o'; the captured
      standard output of MEME is also written here
    - meme_params: optional mapping of option -> value that overrides
      the defaults from get_meme_default_params() (and '-o')

    Returns the filename that MEME's standard output was redirected
    to (<output_dir>/<FASTA basename without extension>.meme). If that
    file already exists MEME is not run. Exits the process with status
    1 if 'meme' cannot be found or if the MEME call fails.
    """
    # Get default parameters for MEME
    params = get_meme_default_params()
    # Set output directory for MEME
    params["-o"] = output_dir
    if meme_params is not None:
        # Update parameters with user-given parameters, if any
        params.update(meme_params)
    # Check if MEME is available
    meme_path = utils.which("meme")
    if meme_path is None:
        logger.critical("Error: Cannot find or execute 'meme' program.")
        sys.exit(1)
    params_str = " ".join(["%s %s" % (opt, params[opt]) for opt in params])
    fasta_basename = os.path.basename(input_fasta_fname).rsplit(".", 1)[0]
    meme_output_fname = os.path.join(output_dir,
                                     "%s.meme" % (fasta_basename))
    if os.path.isfile(meme_output_fname):
        logger.info("Found MEME file %s, skipping..." % (meme_output_fname))
        return meme_output_fname
    meme_cmd = "%s %s %s > %s" % (meme_path,
                                  params_str,
                                  input_fasta_fname,
                                  meme_output_fname)
    logger.info("Calling MEME: ")
    logger.info("Executing: %s" % (meme_cmd))
    t1 = time.time()
    ret_val = os.system(meme_cmd)
    if ret_val != 0:
        logger.critical("Error: MEME call failed.")
        # The shell redirect creates the output file even when MEME
        # fails; remove it so that a rerun does not find it and skip.
        if os.path.isfile(meme_output_fname):
            os.remove(meme_output_fname)
        sys.exit(1)
    t2 = time.time()
    logger.info("MEME completed in %.2f minutes" % ((t2 - t1) / 60.))
    return meme_output_fname
def fastx_collapse_fastq(fastq_filename, output_dir, logger):
    """
    FASTX collapse FASTQ.

    Streams the FASTQ file ('zcat' for names ending in '.gz', 'cat'
    otherwise) through fastx_collapser and gzip, writing the result to
    <output_dir>/<trimmed basename>.collapsed.fasta.gz.

    Takes:

    - fastq_filename: input FASTQ file (optionally gzipped)
    - output_dir: directory in which the collapsed file is written
    - logger: a logger to log messages to

    Returns the collapsed filename on success, or when it already
    exists (the collapsing step is skipped in that case). Returns None
    if fastx_collapser or the input file cannot be found, or if any
    stage of the pipeline fails.
    """
    # Local import: the pipeline is run stage by stage so that every
    # exit status can be checked. A shell pipeline run via os.system
    # only reports the status of its last command (gzip), which hides
    # failures of fastx_collapser itself.
    import subprocess

    fastx_collapser = utils.which("fastx_collapser")
    if fastx_collapser is None:
        logger.critical("Could not find fastx_collapser.")
        return None
    if not os.path.isfile(fastq_filename):
        logger.critical("Could not find input fastq %s" % (fastq_filename))
        return None
    output_basename = utils.trim_fastq_ext(os.path.basename(fastq_filename))
    collapsed_seq_filename = os.path.join(
        output_dir, "%s.collapsed.fasta.gz" % (output_basename))
    if os.path.isfile(collapsed_seq_filename):
        logger.info("%s exists, skipping collapsing step."
                    % (collapsed_seq_filename))
        return collapsed_seq_filename
    # Handle gzipped input since fastx_collapser does not accept
    # gzipped FASTQ files
    cat_program = "zcat" if fastq_filename.endswith(".gz") else "cat"
    stages = [
        [cat_program, fastq_filename],
        # Use -Q 33 flag to signal Illumina quality scores to
        # FASTX-Toolkit
        [fastx_collapser, "-Q", "33"],
        ["gzip", "-c", "-"],
    ]
    logger.info("Executing: %s > %s"
                % (" | ".join([" ".join(argv) for argv in stages]),
                   collapsed_seq_filename))
    procs = []
    pipeline_ok = False
    try:
        with open(collapsed_seq_filename, "wb") as collapsed_out:
            prev_stdout = None
            for stage_num, argv in enumerate(stages):
                is_last = (stage_num == len(stages) - 1)
                proc = subprocess.Popen(
                    argv,
                    stdin=prev_stdout,
                    stdout=collapsed_out if is_last else subprocess.PIPE)
                procs.append(proc)
                if prev_stdout is not None:
                    # Drop the parent's copy of the pipe so an upstream
                    # stage is not left blocked if its reader exits.
                    prev_stdout.close()
                prev_stdout = proc.stdout
            exit_codes = [proc.wait() for proc in reversed(procs)]
        pipeline_ok = all(code == 0 for code in exit_codes)
    except (OSError, IOError) as err:
        # A stage could not be started or the output could not be opened
        logger.critical("Could not run collapsing pipeline: %s" % (err))
        for proc in procs:
            if proc.stdout is not None:
                proc.stdout.close()
            if proc.poll() is None:
                proc.kill()
            proc.wait()
    if not pipeline_ok:
        logger.critical("Error: fastx_collapser command failed.")
        # Remove the partial output so that a later call does not find
        # it and skip the collapsing step.
        if os.path.isfile(collapsed_seq_filename):
            os.remove(collapsed_seq_filename)
        return None
    return collapsed_seq_filename
def check_requirements():
    """
    Check that all programs required by the pipeline are on the path.

    Every program in the list is checked (a missing one does not stop
    the scan), and a warning is printed for each program that
    utils.which() cannot find. For 'genePredToGtf' and 'gtf2gff3.pl'
    an extra message is produced by genePredToGtf_msg() /
    gtf2gff3_msg().

    Returns None if all programs are found; exits the process with
    status 1 if any is missing.
    """
    # Single-argument print(...) behaves the same under the Python 2
    # print statement and the Python 3 print function.
    print("Checking that all required programs are available...")
    # Utilities that need to be on path for pipeline to run
    REQUIRED_PROGRAMS = [# UCSC utils
                         "genePredToGtf",
                         # Tophat/Bowtie
                         "bowtie-build",
                         "tophat",
                         # Bedtools
                         "intersectBed",
                         "subtractBed",
                         "sortBed",
                         "mergeBed",
                         "tagBam",
                         # Related utils
                         "gtf2gff3.pl",
                         # Unix utils
                         "gunzip",
                         "wget",
                         "cat",
                         "zcat",
                         "cut"]
    found_all = True
    for program in REQUIRED_PROGRAMS:
        if utils.which(program) is not None:
            continue
        print("WARNING: Cannot find required program '%s' "
              "on your path. Please install it or add it "
              "to your path if already installed." % (program))
        if program == "genePredToGtf":
            genePredToGtf_msg()
        elif program == "gtf2gff3.pl":
            gtf2gff3_msg()
        # Printed while the remaining programs are still being checked;
        # the process nevertheless exits below once the scan is done.
        print(" - Proceeding anyway...")
        found_all = False
    if not found_all:
        # Do not proceed if programs not found.
        sys.exit(1)
    print("Found all required programs.")
## ## Utilities for working with jellyfish ## import os import sys import time import glob import rnaseqlib import rnaseqlib.utils as utils import rnaseqlib.fastx_utils as fastx_utils from collections import defaultdict jf_path = utils.which("jellyfish") def jf_merge(fname_base): """ Merge all the jellyfish output files in the directory that they're in. Returns merged filename. """ output_dir = os.path.dirname(fname_base) fname_pat = "%s_*" % (fname_base) jf_files = glob.glob(fname_pat) merged_fname = None if len(jf_files) == 0: raise Exception, "Cannot merge, no jf files with " \ "basename %s" %(fname_base)
import subprocess import numpy as np from collections import defaultdict import rnaseqlib import rnaseqlib.utils as utils import rnaseqlib.coords_utils as coords_utils import pybedtools ## ## Paths to bedtools programs ## intersectBed_path = utils.which("intersectBed") mergeBed_path = utils.which("mergeBed") tagBam_path = utils.which("tagBam") coverageBed_path = utils.which("coverageBed") fastaFromBed_path = utils.which("fastaFromBed") def bed_to_gff(bedtool_input): """ Convert BedTool corresponding to BED file into a GFF file. """ for bed_entry in bedtool_input: # chrom, start, end, name, score, strand chrom = bed_entry.fields[0] start = bed_entry.fields[1]
## ## Utilities for working with jellyfish ## import os import sys import time import glob import rnaseqlib import rnaseqlib.utils as utils import rnaseqlib.fastx_utils as fastx_utils from collections import defaultdict jf_path = utils.which("jellyfish") def jf_merge(fname_base): """ Merge all the jellyfish output files in the directory that they're in. Returns merged filename. """ output_dir = os.path.dirname(fname_base) fname_pat = "%s_*" %(fname_base) jf_files = glob.glob(fname_pat) merged_fname = None if len(jf_files) == 0: raise Exception, "Cannot merge, no jf files with " \ "basename %s" %(fname_base) if len(jf_files) == 1:
## ## Utilities for running Homer ## import os import sys import time import rnaseqlib import rnaseqlib.utils as utils homer_path = utils.which("findMotifsGenome.pl") def run_homer(logger, bed_fname, genome, output_dir, params): """ Run Homer against an input BED file. findMotifsGenome.pl <pos file> <genome> <output directory> """ if homer_path is None: logger.critical("Error: Cannot find or execute Homer program.") sys.exit(1) params_str = " ".join(["%s %s" %(p, params[p]) for p in params]) utils.make_dir(output_dir) # If there's a Homer results directory in the target # directory, then don't rerun Homer if os.path.isdir(os.path.join(output_dir, "homerResults")): logger.info("Found Homer results, skipping..") return output_dir homer_cmd = "%s %s %s %s %s" %(homer_path,
## ## Utilities for running Homer ## import os import sys import time import rnaseqlib import rnaseqlib.utils as utils homer_path = utils.which("findMotifsGenome.pl") def run_homer(logger, bed_fname, genome, output_dir, params): """ Run Homer against an input BED file. findMotifsGenome.pl <pos file> <genome> <output directory> """ if homer_path is None: logger.critical("Error: Cannot find or execute Homer program.") sys.exit(1) params_str = " ".join(["%s %s" % (p, params[p]) for p in params]) utils.make_dir(output_dir) # If there's a Homer results directory in the target # directory, then don't rerun Homer if os.path.isdir(os.path.join(output_dir, "homerResults")): logger.info("Found Homer results, skipping..") return output_dir homer_cmd = "%s %s %s %s %s" % (homer_path, bed_fname, genome, output_dir, params_str)