Пример #1
0
def check_clip_utils(logger, required_utils=("cutadapt", "fastx_collapser")):
    """
    Check that the external programs needed for CLIP are available.

    Takes:

    - logger: a logger to log messages to
    - required_utils: iterable of program names, each looked up with
      utils.which()

    Returns nothing. On the first program whose lookup gives None, two
    critical messages are logged and the process exits with status 1.
    """
    # The default is a tuple (not a list) so that no mutable object is
    # shared between calls.
    logger.info("Checking that utilities required for CLIP are available..")
    for program in required_utils:
        if utils.which(program) is None:
            logger.critical("Could not access: %s" % (program))
            logger.critical("Make %s available and try again." % (program))
            sys.exit(1)
    logger.info("Found CLIP utilities.")
Пример #2
0
def check_clip_utils(logger,
                     required_utils=("cutadapt",
                                     "fastx_collapser")):
    """
    Check that necessary utilities are available.

    Takes:

    - logger: a logger to log messages to
    - required_utils: iterable with the names of the programs to look
      up through utils.which()

    Exits with status 1 (after logging two critical messages) as soon
    as one of the programs cannot be found; otherwise returns nothing.
    """
    # A tuple default avoids sharing one mutable list across calls.
    logger.info("Checking that utilities required for CLIP are available..")
    for program in required_utils:
        program_path = utils.which(program)
        if program_path is not None:
            continue
        logger.critical("Could not access: %s" %(program))
        logger.critical("Make %s available and try again." %(program))
        sys.exit(1)
    logger.info("Found CLIP utilities.")
Пример #3
0
def trim_clip_adaptors(fastq_filename,
                       adaptors_filename,
                       output_dir,
                       logger,
                       min_read_len=5):
    """
    Trim CLIP adaptors using 'cutadapt'.

    Takes:

    - fastq_filename: path of the FASTQ file to trim
    - adaptors_filename: path of a text file whose contents are handed
      to 'cutadapt' as command-line arguments (newlines become spaces)
    - output_dir: directory the trimmed file is written to
    - logger: a logger to log messages to
    - min_read_len: value given to cutadapt's '-m' option

    Returns the path of the trimmed file, which is named after the
    input basename (as shortened by utils.trim_fastq_ext()) followed by
    '_trimmed.fastq.gz'. If that file already exists, nothing is run
    and the path is returned as is.

    Exits with status 1 when trimming has to run but 'cutadapt' or the
    adaptors file cannot be found.
    """
    logger.info("Trimming CLIP adaptors from: %s" %(fastq_filename))
    output_basename = \
        utils.trim_fastq_ext(os.path.basename(fastq_filename))
    output_filename = os.path.join(output_dir,
                                   "%s_trimmed.fastq.gz" \
                                   %(output_basename))
    if os.path.isfile(output_filename):
        logger.info("SKIPPING: %s already exists!" \
                    %(output_filename))
        return output_filename
    # 'cutadapt' is only needed once we know trimming must actually run
    cutadapt_path = utils.which("cutadapt")
    if cutadapt_path is None:
        logger.critical("Could not find \'cutadapt\' on the path. " \
                        "Please install \'cutadapt\' or make the installed " \
                        "version available on path.")
        # Stop here: continuing would execute a command starting with 'None'
        sys.exit(1)
    logger.info("  - Outputting trimmed sequences to: %s" \
                %(output_filename))
    # Load adaptors to pass to 'cutadapt'
    if not os.path.isfile(adaptors_filename):
        logger.critical("Could not find adaptors file %s" \
                        %(adaptors_filename))
        sys.exit(1)
    # The context manager closes the file even if reading fails
    with open(adaptors_filename, "r") as adaptors_in:
        # Substitute newlines with spaces
        adaptors = adaptors_in.read().strip().replace("\n", " ")
    # cutadapt's standard output is redirected to '<output_filename>.log'
    cutadapt_cmd = "%s %s %s -o %s -m %d -q 3 > %s.log" %(cutadapt_path,
                                                          adaptors,
                                                          fastq_filename,
                                                          output_filename,
                                                          min_read_len,
                                                          output_filename)
    logger.info("Executing: %s" %(cutadapt_cmd))
    t1 = time.time()
    ret_val = os.system(cutadapt_cmd)
    t2 = time.time()
    logger.info("Trimming took %.2f mins." %((t2 - t1)/60.))
    if ret_val != 0:
        # Surface the failure; the return value is kept unchanged so
        # existing callers are not affected.
        logger.critical("cutadapt command returned non-zero value %d; " \
                        "%s may be missing or incomplete." \
                        %(ret_val, output_filename))
    return output_filename
Пример #4
0
def multi_tagBam(bam_filename, intervals_files, intervals_labels,
                 output_filename, logger):
    """
    Call tagBam mapping BAM file to multiple bed/gff files.

    Takes:

    - bam_filename: The BAM file path
    - intervals_files: List of string paths for the interval files
      (BED or GFF file format)
    - intervals_labels: Labels for each of the files in 'intervals_files'
    - output_filename: BAM filename to use as output
    - logger: a logger to log messages to

    Returns 'output_filename' when the command succeeds or when the
    file already exists. Returns None when tagBam cannot be found or
    the command returns a non-zero value; in the latter case any output
    file left behind is removed.
    """
    num_interval_files = len(intervals_files)
    logger.info("Running tagBam against %d interval files.." \
                %(num_interval_files))
    tagBam = utils.which("tagBam")
    if tagBam is None:
        logger.critical("tagBam not found.")
        return None
    if os.path.isfile(output_filename):
        logger.info("Found %s, skipping." % (output_filename))
        return output_filename
    t1 = time.time()
    args = {
        "tagBam": tagBam,
        "bam_filename": bam_filename,
        "intervals_files": " ".join(intervals_files),
        "intervals_labels": " ".join(intervals_labels),
        "output_filename": output_filename
    }
    tagBam_cmd = \
      "%(tagBam)s -i %(bam_filename)s -files %(intervals_files)s " \
      "-labels %(intervals_labels)s -intervals -f 1 > %(output_filename)s" \
      %(args)
    logger.info("Executing: %s" % (tagBam_cmd))
    ret_val = os.system(tagBam_cmd)
    t2 = time.time()
    logger.info("tagBam took %.2f minutes." % ((t2 - t1) / 60.))
    if ret_val != 0:
        logger.critical("tagBam command failed.")
        # The '>' redirection creates the output file even when tagBam
        # fails. Remove it so that a later call does not take the
        # "Found ..., skipping." branch and hand back a broken file.
        if os.path.isfile(output_filename):
            try:
                os.remove(output_filename)
            except OSError as err:
                logger.critical("Could not remove %s: %s" \
                                %(output_filename, err))
        return None
    return output_filename
Пример #5
0
def trim_clip_adaptors(fastq_filename,
                       adaptors_filename,
                       output_dir,
                       logger,
                       min_read_len=5):
    """
    Trim CLIP adaptors using 'cutadapt'.

    Takes:

    - fastq_filename: FASTQ file to trim
    - adaptors_filename: text file whose contents are passed to
      'cutadapt' as arguments, with newlines replaced by spaces
    - output_dir: directory that receives the trimmed file
    - logger: a logger to log messages to
    - min_read_len: value used for cutadapt's '-m' option

    Returns the trimmed file's path: the input basename (shortened by
    utils.trim_fastq_ext()) plus '_trimmed.fastq.gz', inside
    'output_dir'. An already existing file is returned without running
    anything.

    Exits with status 1 if trimming is required and either 'cutadapt'
    or the adaptors file is missing.
    """
    logger.info("Trimming CLIP adaptors from: %s" % (fastq_filename))
    output_basename = \
        utils.trim_fastq_ext(os.path.basename(fastq_filename))
    output_filename = os.path.join(output_dir,
                                   "%s_trimmed.fastq.gz" % (output_basename))
    if os.path.isfile(output_filename):
        logger.info("SKIPPING: %s already exists!" % (output_filename))
        return output_filename
    # Look for 'cutadapt' only when there is real work to do
    cutadapt_path = utils.which("cutadapt")
    if cutadapt_path is None:
        logger.critical("Could not find \'cutadapt\' on the path. " \
                        "Please install \'cutadapt\' or make the installed " \
                        "version available on path.")
        # Without this exit the command below would begin with 'None'
        sys.exit(1)
    logger.info("  - Outputting trimmed sequences to: %s" % (output_filename))
    # Load adaptors to pass to 'cutadapt'
    if not os.path.isfile(adaptors_filename):
        logger.critical("Could not find adaptors file %s" \
                        % (adaptors_filename))
        sys.exit(1)
    # 'with' guarantees the file handle is closed
    with open(adaptors_filename, "r") as adaptors_in:
        # Substitute newlines with spaces
        adaptors = adaptors_in.read().strip().replace("\n", " ")
    # Standard output of cutadapt goes to '<output_filename>.log'
    cutadapt_cmd = "%s %s %s -o %s -m %d -q 3 > %s.log" % (
        cutadapt_path, adaptors, fastq_filename, output_filename, min_read_len,
        output_filename)
    logger.info("Executing: %s" % (cutadapt_cmd))
    t1 = time.time()
    ret_val = os.system(cutadapt_cmd)
    t2 = time.time()
    logger.info("Trimming took %.2f mins." % ((t2 - t1) / 60.))
    if ret_val != 0:
        # Report the failure but keep the return value as before so
        # that current callers keep working.
        logger.critical("cutadapt command returned non-zero value %d; " \
                        "%s may be missing or incomplete." \
                        % (ret_val, output_filename))
    return output_filename
Пример #6
0
def multi_tagBam(bam_filename, intervals_files, intervals_labels, output_filename, logger):
    """
    Call tagBam mapping BAM file to multiple bed/gff files.

    Takes:

    - bam_filename: The BAM file path
    - intervals_files: List of string paths for the interval files
      (BED or GFF file format)
    - intervals_labels: Labels for each of the files in 'intervals_files'
    - output_filename: BAM filename to use as output
    - logger: a logger to log messages to

    Returns 'output_filename' if the command succeeded or the file was
    already present, and None if tagBam is not found or the command
    returned a non-zero value (a leftover output file is then deleted).
    """
    num_interval_files = len(intervals_files)
    logger.info("Running tagBam against %d interval files.." % (num_interval_files))
    tagBam = utils.which("tagBam")
    if tagBam is None:
        logger.critical("tagBam not found.")
        return None
    if os.path.isfile(output_filename):
        logger.info("Found %s, skipping." % (output_filename))
        return output_filename
    t1 = time.time()
    args = {
        "tagBam": tagBam,
        "bam_filename": bam_filename,
        "intervals_files": " ".join(intervals_files),
        "intervals_labels": " ".join(intervals_labels),
        "output_filename": output_filename,
    }
    tagBam_cmd = (
        "%(tagBam)s -i %(bam_filename)s -files %(intervals_files)s "
        "-labels %(intervals_labels)s -intervals -f 1 > %(output_filename)s" % (args)
    )
    logger.info("Executing: %s" % (tagBam_cmd))
    ret_val = os.system(tagBam_cmd)
    t2 = time.time()
    logger.info("tagBam took %.2f minutes." % ((t2 - t1) / 60.0))
    if ret_val == 0:
        return output_filename
    logger.critical("tagBam command failed.")
    # Shell redirection creates the output file regardless of tagBam's
    # outcome; delete it so the "Found ..., skipping." shortcut above
    # cannot pick up a broken file on the next call.
    if os.path.isfile(output_filename):
        try:
            os.remove(output_filename)
        except OSError as err:
            logger.critical("Could not remove %s: %s" % (output_filename, err))
    return None
Пример #7
0
def check_requirements():
    """
    Check that every program in REQUIRED_PROGRAMS can be found.

    Each name is looked up with utils.which(). A warning is printed for
    every program that is not found (with an extra message for
    'genePredToGtf' and 'gtf2gff3.pl'). After all programs have been
    checked, the process exits with status 1 if any was missing.
    """
    # Single-argument print(...) behaves the same as the print statement
    # under Python 2 and is also valid Python 3.
    print("Checking that all required programs are available...")
    # Utilities that need to be on path for pipeline to run
    REQUIRED_PROGRAMS = [  # UCSC utils
        "genePredToGtf",
        # Tophat/Bowtie
        "bowtie-build",
        "tophat",
        # Bedtools
        "intersectBed",
        "subtractBed",
        "sortBed",
        "mergeBed",
        "tagBam",
        # Related utils
        "gtf2gff3.pl",
        # Unix utils
        "gunzip",
        "wget",
        "cat",
        "zcat",
        "cut"
    ]
    found_all = True
    for program in REQUIRED_PROGRAMS:
        if utils.which(program) is not None:
            continue
        # Message fixed: the original had a stray period ("add it. to")
        print("WARNING: Cannot find required program \'%s\' "
              "on your path.  Please install it or add it "
              "to your path if already installed." % (program))
        if program == "genePredToGtf":
            genePredToGtf_msg()
        elif program == "gtf2gff3.pl":
            gtf2gff3_msg()
        # NOTE(review): "Proceeding" presumably refers to checking the
        # remaining programs, since the function exits below -- confirm.
        print("  - Proceeding anyway...")
        found_all = False
    if found_all:
        print("Found all required programs.")
    else:
        # Do not proceed if programs not found.
        sys.exit(1)
Пример #8
0
def run_meme(logger, input_fasta_fname, output_dir,
             meme_params=None):
    """
    Run MEME against an input FASTA file.

    Takes:

    - logger: a logger to log messages to
    - input_fasta_fname: path of the FASTA file given to MEME
    - output_dir: directory passed to MEME as '-o'; the captured output
      file is also placed here
    - meme_params: optional dict of extra options; entries override the
      defaults from get_meme_default_params() and the '-o' setting

    Returns the path of the '.meme' file holding MEME's redirected
    output. If that file already exists, MEME is not run.

    Exits with status 1 if 'meme' cannot be found or the command
    returns a non-zero value; a partial output file is removed first.
    """
    # Get default parameters for MEME
    params = get_meme_default_params()
    # Set output directory for MEME
    params.update({"-o": output_dir})
    if meme_params is not None:
        # Update parameters with user-given parameters, if any
        params.update(meme_params)
    # Check if MEME is available
    meme_path = utils.which("meme")
    if meme_path is None:
        logger.critical("Error: Cannot find or execute \'meme\' program.")
        sys.exit(1)
    params_str =  " ".join(["%s %s" %(p, params[p]) for p in params])
    # Output name is the FASTA basename with its last extension dropped
    fasta_basename = \
        os.path.basename(input_fasta_fname).rsplit(".", 1)[0]
    meme_output_fname = \
        os.path.join(output_dir, "%s.meme" %(fasta_basename))
    if os.path.isfile(meme_output_fname):
        logger.info("Found MEME file %s, skipping..." \
                    %(meme_output_fname))
        return meme_output_fname
    meme_cmd = "%s %s %s > %s" %(meme_path,
                                 params_str,
                                 input_fasta_fname,
                                 meme_output_fname)
    logger.info("Calling MEME: ")
    logger.info("Executing: %s" %(meme_cmd))
    t1 = time.time()
    ret_val = os.system(meme_cmd)
    if ret_val != 0:
        logger.critical("Error: MEME call failed.")
        # The '>' redirection creates the output file even on failure;
        # remove it so the next run does not skip on a broken file.
        if os.path.isfile(meme_output_fname):
            try:
                os.remove(meme_output_fname)
            except OSError as err:
                logger.critical("Could not remove %s: %s" \
                                %(meme_output_fname, err))
        sys.exit(1)
    t2 = time.time()
    logger.info("MEME completed in %.2f minutes" %((t2 - t1)/60.))
    return meme_output_fname
Пример #9
0
def fastx_collapse_fastq(fastq_filename, output_dir, logger):
    """
    Collapse a FASTQ file with 'fastx_collapser'.

    Takes:

    - fastq_filename: path of the FASTQ file (a name ending in '.gz'
      is read through 'zcat', anything else through 'cat')
    - output_dir: directory the collapsed file is written to
    - logger: a logger to log messages to

    Returns the path of the gzip-compressed collapsed file, named after
    the input basename (shortened by utils.trim_fastq_ext()) followed
    by '.collapsed.fasta.gz'. If that file already exists, the
    collapsing step is skipped.

    Returns None if 'fastx_collapser' or the input file cannot be found
    or if the command returns a non-zero value; in the last case any
    output file left behind is removed.
    """
    fastx_collapser = utils.which("fastx_collapser")
    if fastx_collapser is None:
        logger.critical("Could not find fastx_collapser.")
        return None
    if not os.path.isfile(fastq_filename):
        logger.critical("Could not find input fastq %s" \
                        %(fastq_filename))
        return None
    output_basename = \
        utils.trim_fastq_ext(os.path.basename(fastq_filename))
    collapsed_seq_filename = os.path.join(output_dir,
                                          "%s.collapsed.fasta.gz" \
                                          %(output_basename))
    if os.path.isfile(collapsed_seq_filename):
        logger.info("%s exists, skipping collapsing step." \
                    %(collapsed_seq_filename))
        return collapsed_seq_filename
    cat_fastq_cmd = "cat"
    # Handle gzipped input since fastx_collapser does not accept
    # gzipped FASTQ files
    if fastq_filename.endswith(".gz"):
        cat_fastq_cmd = "zcat"
    cat_fastq_cmd += " %s" %(fastq_filename)
    # Use -Q 33 flag to signal Illumina quality scores to
    # FASTX-Toolkit
    fastx_collapser_cmd = "%s | %s -Q 33 | gzip -c - > %s" \
        %(cat_fastq_cmd,
          fastx_collapser,
          collapsed_seq_filename)
    logger.info("Executing: %s" %(fastx_collapser_cmd))
    # NOTE(review): a shell pipeline normally reports the status of its
    # last stage (gzip here), so a failure in an earlier stage may not
    # show up in ret_val -- confirm for the shell in use.
    ret_val = os.system(fastx_collapser_cmd)
    if ret_val != 0:
        logger.critical("Error: fastx_collapser command failed.")
        # The '>' redirection creates the output file even on failure;
        # remove it so a later call does not skip on a broken file.
        if os.path.isfile(collapsed_seq_filename):
            try:
                os.remove(collapsed_seq_filename)
            except OSError as err:
                logger.critical("Could not remove %s: %s" \
                                %(collapsed_seq_filename, err))
        return None
    return collapsed_seq_filename
Пример #10
0
def check_requirements():
    """
    Verify that the programs listed in REQUIRED_PROGRAMS are available.

    Every name is passed to utils.which(). For each one that is not
    found a warning is printed; 'genePredToGtf' and 'gtf2gff3.pl' also
    trigger their dedicated message functions. Once the whole list has
    been checked, the process exits with status 1 if anything was
    missing.
    """
    # print(...) with a single argument gives the same output as the
    # print statement on Python 2 and also parses on Python 3.
    print("Checking that all required programs are available...")
    # Utilities that need to be on path for pipeline to run
    REQUIRED_PROGRAMS = [# UCSC utils
                         "genePredToGtf",
                         # Tophat/Bowtie
                         "bowtie-build",
                         "tophat",
                         # Bedtools
                         "intersectBed",
                         "subtractBed",
                         "sortBed",
                         "mergeBed",
                         "tagBam",
                         # Related utils
                         "gtf2gff3.pl",
                         # Unix utils
                         "gunzip",
                         "wget",
                         "cat",
                         "zcat",
                         "cut"]
    found_all = True
    for program in REQUIRED_PROGRAMS:
        if utils.which(program) is None:
            # Stray period removed from the original text ("add it. to")
            print("WARNING: Cannot find required program \'%s\' "
                  "on your path.  Please install it or add it "
                  "to your path if already installed." %(program))
            if program == "genePredToGtf":
                genePredToGtf_msg()
            elif program == "gtf2gff3.pl":
                gtf2gff3_msg()
            # NOTE(review): presumably "proceeding" means the remaining
            # programs are still checked; the function exits below.
            print("  - Proceeding anyway...")
            found_all = False
    if not found_all:
        # Do not proceed if programs not found.
        sys.exit(1)
    print("Found all required programs.")
Пример #11
0
##
## Utilities for working with jellyfish
##
import os
import sys
import time
import glob

import rnaseqlib
import rnaseqlib.utils as utils
import rnaseqlib.fastx_utils as fastx_utils

from collections import defaultdict

# Result of looking up the 'jellyfish' program with utils.which(),
# computed once when this module is imported.
# NOTE(review): presumably None when jellyfish is not on the path --
# confirm against utils.which().
jf_path = utils.which("jellyfish")


def jf_merge(fname_base):
    """
    Merge all the jellyfish output files in the directory
    that they're in.

    Returns merged filename.
    """
    output_dir = os.path.dirname(fname_base)
    fname_pat = "%s_*" % (fname_base)
    jf_files = glob.glob(fname_pat)
    merged_fname = None
    if len(jf_files) == 0:
        raise Exception, "Cannot merge, no jf files with " \
                         "basename %s" %(fname_base)
Пример #12
0
import subprocess

import numpy as np

from collections import defaultdict

import rnaseqlib
import rnaseqlib.utils as utils
import rnaseqlib.coords_utils as coords_utils

import pybedtools

##
## Paths to bedtools programs, each looked up with utils.which()
## once when this module is imported.
## NOTE(review): presumably None for a program that is not on the
## path -- confirm against utils.which().
##
intersectBed_path = utils.which("intersectBed")
mergeBed_path = utils.which("mergeBed")
tagBam_path = utils.which("tagBam")
coverageBed_path = utils.which("coverageBed")
fastaFromBed_path = utils.which("fastaFromBed")


def bed_to_gff(bedtool_input):
    """
    Convert BedTool corresponding to BED
    file into a GFF file.
    """
    for bed_entry in bedtool_input:
        # chrom, start, end, name, score, strand
        chrom = bed_entry.fields[0]
        start = bed_entry.fields[1]
Пример #13
0
##
## Utilities for working with jellyfish
##
import os
import sys
import time
import glob

import rnaseqlib
import rnaseqlib.utils as utils
import rnaseqlib.fastx_utils as fastx_utils

from collections import defaultdict

# Lookup of the 'jellyfish' program via utils.which(), done once at
# import time.
# NOTE(review): presumably None if jellyfish cannot be found --
# confirm against utils.which().
jf_path = utils.which("jellyfish")
    
def jf_merge(fname_base):
    """
    Merge all the jellyfish output files in the directory
    that they're in.

    Returns merged filename.
    """
    output_dir = os.path.dirname(fname_base)
    fname_pat = "%s_*" %(fname_base)
    jf_files = glob.glob(fname_pat)
    merged_fname = None
    if len(jf_files) == 0:
        raise Exception, "Cannot merge, no jf files with " \
                         "basename %s" %(fname_base)
    if len(jf_files) == 1:
Пример #14
0
import subprocess

import numpy as np

from collections import defaultdict

import rnaseqlib
import rnaseqlib.utils as utils
import rnaseqlib.coords_utils as coords_utils

import pybedtools

##
## Paths to bedtools programs. Every entry is the result of a
## utils.which() lookup performed when this module is imported.
## NOTE(review): presumably None when the program is missing from
## the path -- confirm against utils.which().
##
intersectBed_path = utils.which("intersectBed")
mergeBed_path = utils.which("mergeBed")
tagBam_path = utils.which("tagBam")
coverageBed_path = utils.which("coverageBed")
fastaFromBed_path = utils.which("fastaFromBed")


def bed_to_gff(bedtool_input):
    """
    Convert BedTool corresponding to BED
    file into a GFF file.
    """
    for bed_entry in bedtool_input:
        # chrom, start, end, name, score, strand
        chrom = bed_entry.fields[0]
        start = bed_entry.fields[1]
Пример #15
0
##
## Utilities for running Homer
##
import os
import sys
import time

import rnaseqlib
import rnaseqlib.utils as utils

# Result of looking up Homer's 'findMotifsGenome.pl' script with
# utils.which() at import time; run_homer() checks it for None
# before building its command.
homer_path = utils.which("findMotifsGenome.pl")


def run_homer(logger, bed_fname, genome, output_dir,
              params):
    """
    Run Homer against an input BED file.

    findMotifsGenome.pl <pos file> <genome> <output directory> 
    """
    if homer_path is None:
        logger.critical("Error: Cannot find or execute Homer program.")
        sys.exit(1)
    params_str =  " ".join(["%s %s" %(p, params[p]) for p in params])
    utils.make_dir(output_dir)
    # If there's a Homer results directory in the target
    # directory, then don't rerun Homer
    if os.path.isdir(os.path.join(output_dir, "homerResults")):
        logger.info("Found Homer results, skipping..")
        return output_dir
    homer_cmd = "%s %s %s %s %s" %(homer_path,
Пример #16
0
##
## Utilities for running Homer
##
import os
import sys
import time

import rnaseqlib
import rnaseqlib.utils as utils

# Lookup of Homer's 'findMotifsGenome.pl' script through
# utils.which(), done once on import. run_homer() treats a None
# value as "Homer not available".
homer_path = utils.which("findMotifsGenome.pl")


def run_homer(logger, bed_fname, genome, output_dir, params):
    """
    Run Homer against an input BED file.

    findMotifsGenome.pl <pos file> <genome> <output directory> 
    """
    if homer_path is None:
        logger.critical("Error: Cannot find or execute Homer program.")
        sys.exit(1)
    params_str = " ".join(["%s %s" % (p, params[p]) for p in params])
    utils.make_dir(output_dir)
    # If there's a Homer results directory in the target
    # directory, then don't rerun Homer
    if os.path.isdir(os.path.join(output_dir, "homerResults")):
        logger.info("Found Homer results, skipping..")
        return output_dir
    homer_cmd = "%s %s %s %s %s" % (homer_path, bed_fname, genome, output_dir,
                                    params_str)