示例#1
0
def runControlCPC(infile, outfile):
    """Run the ``cpc.sh`` wrapper script on *infile*.

    The script is handed *infile*, *outfile*, a fixed working directory
    and an evidence file name that is built from
    ``P.snip(outfile, ".result")`` with ``.evidence`` appended.
    """
    # cpc.sh itself invokes farm.py, so it has to be resolvable via $PATH
    farm_path = IOTools.which("farm.py")
    assert farm_path, "farm.py needs to be in $PATH for cpc to run"

    # cpc's default parameters do not work with later blast releases,
    # hence log which blastx binary is going to be picked up
    blast_path = IOTools.which("blastx")
    E.info("Running cpc with blast version:%s" % blast_path)

    working_dir = "lncRNA_control/cpc"
    result_evidence = "%s.evidence" % P.snip(outfile, ".result")

    # NOTE(review): P.run() is called without arguments; presumably it picks
    # up `statement` and the %(...)s values from this frame -- keep the
    # local names in sync with the placeholders.
    statement = " ".join(("%(pipeline_scriptsdir)s/cpc.sh",
                          "%(infile)s",
                          "%(outfile)s",
                          "%(working_dir)s",
                          "%(result_evidence)s"))
    P.run()
示例#2
0
def check_executables(filenames):
    """Verify that every name in *filenames* is found by ``IOTools.which``.

    Raises
    ------
    ValueError
        If one or more names are not found; the message lists all of
        them, comma-separated.
    """
    missing = [name for name in filenames if not IOTools.which(name)]

    if not missing:
        return

    raise ValueError("missing executables: %s" % ",".join(missing))
def main(argv=None):
    """Compute read coverage from a BAM file and write it as a track.

    Two workflows are implemented:

    1. If reads are shifted/extended or pairs are merged, intervals are
       written to a temporary bed file, coverage is computed with
       ``bedtools genomecov`` and the result is converted with
       ``bedGraphToBigWig``.
    2. Otherwise a pysam pileup is walked contig by contig and written
       as wiggle/bedGraph text. For bigwig output the text is converted
       with ``wigToBigWig``; for all other formats it is copied to
       stdout.

    Arguments
    ---------
    argv : list, optional
        Command line arguments; ``sys.argv`` is used when empty or None.

    Returns
    -------
    int or None
        A non-zero value if the bigwig conversion in workflow 2 failed,
        None otherwise.

    Raises
    ------
    ValueError
        If no BAM file is given, if shift/extend is requested for a
        format other than bigwig, if bigwig output lacks an output file
        name, or if no intervals were produced that could be
        merged/scaled.
    OSError
        If ``wigToBigWig`` is required but not found in the path.
    NotImplementedError
        If scaling, or an output format without a text writer, is
        requested in the pileup workflow.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-o",
                      "--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("bedgraph", "wiggle", "bigbed", "bigwig",
                               "bed"),
                      help="output format [default=%default]")

    parser.add_option("-s",
                      "--shift-size",
                      dest="shift",
                      type="int",
                      help="shift reads by a certain amount (ChIP-Seq) "
                      "[%default]")

    parser.add_option("-e",
                      "--extend",
                      dest="extend",
                      type="int",
                      help="extend reads by a certain amount "
                      "(ChIP-Seq) [%default]")

    parser.add_option("-p",
                      "--wiggle-span",
                      dest="span",
                      type="int",
                      help="span of a window in wiggle tracks "
                      "[%default]")

    parser.add_option("-m",
                      "--merge-pairs",
                      dest="merge_pairs",
                      action="store_true",
                      help="merge paired-ended reads into a single "
                      "bed interval [default=%default].")

    parser.add_option("--scale-base",
                      dest="scale_base",
                      type="float",
                      help="number of reads/pairs to scale bigwig file to. "
                      "The default is to scale to 1M reads "
                      "[default=%default]")

    parser.add_option("--scale-method",
                      dest="scale_method",
                      type="choice",
                      choices=(
                          "none",
                          "reads",
                      ),
                      help="scale bigwig output. 'reads' will normalize by "
                      "the total number reads in the bam file that are used "
                      "to construct the bigwig file. If --merge-pairs is used "
                      "the number of pairs output will be used for "
                      "normalization. 'none' will not scale the bigwig file"
                      "[default=%default]")

    parser.add_option("--max-insert-size",
                      dest="max_insert_size",
                      type="int",
                      help="only merge if insert size less that "
                      "# bases. 0 turns of this filter "
                      "[default=%default].")

    parser.add_option("--min-insert-size",
                      dest="min_insert_size",
                      type="int",
                      help="only merge paired-end reads if they are "
                      "at least # bases apart. "
                      "0 turns of this filter. [default=%default]")

    parser.set_defaults(
        samfile=None,
        output_format="wiggle",
        shift=0,
        extend=0,
        span=1,
        merge_pairs=None,
        min_insert_size=0,
        max_insert_size=0,
        scale_method='none',
        scale_base=1000000,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if len(args) >= 1:
        options.samfile = args[0]
    if len(args) == 2:
        options.output_filename_pattern = args[1]
    if not options.samfile:
        raise ValueError("please provide a bam file")

    # Read BAM file using Pysam
    samfile = pysam.AlignmentFile(options.samfile, "rb")

    # Create temporary files / folders
    tmpdir = tempfile.mkdtemp()
    E.debug("temporary files are in %s" % tmpdir)
    tmpfile_wig = os.path.join(tmpdir, "wig")
    tmpfile_sizes = os.path.join(tmpdir, "sizes")

    # Create dictionary of contig sizes
    contig_sizes = dict(list(zip(samfile.references, samfile.lengths)))
    # write contig sizes
    outfile_size = IOTools.open_file(tmpfile_sizes, "w")
    for contig, size in sorted(contig_sizes.items()):
        outfile_size.write("%s\t%s\n" % (contig, size))
    outfile_size.close()

    # Shift and extend only available for bigwig format
    if options.shift or options.extend:
        if options.output_format != "bigwig":
            raise ValueError(
                "shift and extend only available for bigwig output")

    # Output filename required for bigwig computation
    if options.output_format == "bigwig":
        if not options.output_filename_pattern:
            raise ValueError(
                "please specify an output file for bigwig computation.")

        # check required executable file is in the path
        executable_name = "wigToBigWig"
        executable = IOTools.which(executable_name)
        if not executable:
            raise OSError("could not find %s in path." % executable_name)

        # Open output file
        outfile = IOTools.open_file(tmpfile_wig, "w")
        E.info("starting output to %s" % tmpfile_wig)
    else:
        # text is staged in the temporary file and copied to stdout at
        # the end of the pileup workflow
        outfile = IOTools.open_file(tmpfile_wig, "w")
        E.info("starting output to stdout")

    # Set up output write functions. ``outf`` stays None for formats
    # without a text writer (bed, bigbed); ``span_writer`` remembers the
    # SpanWriter instance so that it can be flushed explicitly later.
    outf = None
    span_writer = None
    if options.output_format in ("wiggle", "bigwig"):
        # wiggle is one-based, so add 1, also step-size is 1, so need
        # to output all bases
        if options.span == 1:
            def outf(outfile, contig, start, end, val):
                outfile.write(
                    "".join(["%i\t%i\n" % (x, val)
                             for x in range(start + 1, end + 1)]))
        else:
            outf = span_writer = SpanWriter(options.span)
    elif options.output_format == "bedgraph":
        # bed is 0-based, open-closed
        def outf(outfile, contig, start, end, val):
            outfile.write("%s\t%i\t%i\t%i\n" % (contig, start, end, val))

    # initialise counters
    ninput, nskipped, ncontigs = 0, 0, 0

    # set output file name (interpolated into the shell statements below)
    output_filename_pattern = options.output_filename_pattern

    # shift and extend or merge pairs. Output temporay bed file
    if options.shift > 0 or options.extend > 0 or options.merge_pairs:
        # Workflow 1: convert to bed intervals and use bedtools
        # genomecov to build a coverage file.
        # Convert to bigwig with UCSC tools bedGraph2BigWig

        if options.merge_pairs:
            # merge pairs using bam2bed
            E.info("merging pairs to temporary file")
            counter = _bam2bed.merge_pairs(
                samfile,
                outfile,
                min_insert_size=options.min_insert_size,
                max_insert_size=options.max_insert_size,
                bed_format=3)
            E.info("merging results: {}".format(counter))
            if counter.output == 0:
                raise ValueError("no pairs output after merging")
        else:
            # create bed file with shifted/extended tags
            shift, extend = options.shift, options.extend
            shift_extend = shift + extend
            counter = E.Counter()

            for contig in samfile.references:
                E.debug("output for %s" % contig)
                lcontig = contig_sizes[contig]

                for read in samfile.fetch(contig):
                    if read.is_reverse:
                        start = max(0, read.pos + read.alen - shift_extend)
                    else:
                        start = max(0, read.pos + shift)

                    # intervals extending beyond contig are removed
                    if start >= lcontig:
                        continue

                    end = min(lcontig, start + extend)
                    outfile.write("%s\t%i\t%i\n" % (contig, start, end))
                    counter.output += 1

        outfile.close()

        if options.scale_method == "reads":
            # guard against a division by zero when nothing was written
            if counter.output == 0:
                raise ValueError(
                    "no intervals output, unable to scale by read count")
            scale_factor = float(options.scale_base) / counter.output

            E.info("scaling: method=%s scale_quantity=%i scale_factor=%f" %
                   (options.scale_method, counter.output, scale_factor))
            scale = "-scale %f" % scale_factor
        else:
            scale = ""

        # Convert bed file to coverage file (bedgraph)
        tmpfile_bed = os.path.join(tmpdir, "bed")
        E.info("computing coverage")
        # calculate coverage - format is bedgraph
        statement = """bedtools genomecov -bg -i %(tmpfile_wig)s %(scale)s
        -g %(tmpfile_sizes)s > %(tmpfile_bed)s""" % locals()
        E.run(statement)

        # Convert bedgraph to bigwig
        E.info("converting to bigwig")
        tmpfile_sorted = os.path.join(tmpdir, "sorted")
        statement = ("sort -k 1,1 -k2,2n %(tmpfile_bed)s > %(tmpfile_sorted)s;"
                     "bedGraphToBigWig %(tmpfile_sorted)s %(tmpfile_sizes)s "
                     "%(output_filename_pattern)s" % locals())
        E.run(statement)

    else:

        # Workflow 2: use pysam column iterator to build a
        # wig file. Then convert to bigwig of bedgraph file
        # with UCSC tools.
        def column_iter(iterator):
            """Group consecutive pileup columns of equal depth into runs.

            Yields (start, end, n) where start/end are the positions of
            the first/last column of the run and n is the depth.
            """
            start = None
            end = 0
            n = None
            for t in iterator:
                if t.pos - end > 1 or n != t.n:
                    if start is not None:
                        yield start, end, n
                    start = t.pos
                    end = t.pos
                    n = t.n
                end = t.pos
            # nothing to report for a contig without any pileup column;
            # yielding (None, 0, None) would break the numeric
            # comparisons in the caller
            if start is not None:
                yield start, end, n

        if options.scale_method != "none":
            raise NotImplementedError(
                "scaling not implemented for pileup method")

        if outf is None:
            # previously failed with a NameError on first use
            raise NotImplementedError(
                "output format `%s` not implemented for pileup method" %
                options.output_format)

        # Bedgraph track definition
        if options.output_format == "bedgraph":
            outfile.write("track type=bedGraph\n")

        # NOTE(review): column_iter reports `end` as the last covered
        # position (inclusive) while both writers treat `end` as
        # exclusive, so the final base of every run (and any
        # single-base run) is not written -- confirm whether intended.
        for contig in samfile.references:
            # if contig != "chrX": continue
            E.debug("output for %s" % contig)
            lcontig = contig_sizes[contig]

            # Write wiggle header
            if options.output_format in ("wiggle", "bigwig"):
                outfile.write("variableStep chrom=%s span=%i\n" %
                              (contig, options.span))

            # Generate pileup per contig using pysam and iterate over columns
            for start, end, val in column_iter(samfile.pileup(contig)):
                # patch: there was a problem with bam files and reads
                # overextending at the end. These are usually Ns, but
                # need to check as otherwise wigToBigWig fails.
                if lcontig <= end:
                    E.warn("read extending beyond contig: %s: %i > %i" %
                           (contig, end, lcontig))
                    end = lcontig
                    if start >= end:
                        continue

                if val > 0:
                    outf(outfile, contig, start, end, val)
            ncontigs += 1

        # Emit whatever the span writer still holds, then close the
        # file so that everything is on disk before it is read back.
        if span_writer is not None:
            span_writer.flush(outfile)
        outfile.close()

        E.info("finished output")

        # Report counters
        E.info("ninput=%i, ncontigs=%i, nskipped=%i" %
               (ninput, ncontigs, nskipped))

        # Convert to binary formats
        if options.output_format == "bigwig":
            E.info("starting %s conversion" % executable)
            try:
                retcode = subprocess.call(" ".join(
                    (executable, tmpfile_wig, tmpfile_sizes,
                     output_filename_pattern)),
                                          shell=True)
                if retcode != 0:
                    E.warn("%s terminated with signal: %i" %
                           (executable, -retcode))
                    return -retcode
            except OSError as msg:
                E.warn("Error while executing bigwig: %s" % msg)
                return 1
            E.info("finished bigwig conversion")
        else:
            with open(tmpfile_wig) as inf:
                sys.stdout.write(inf.read())

    samfile.close()

    # Cleanup temp files
    shutil.rmtree(tmpdir)

    E.stop()
示例#4
0
    def getRunStatement(self, infile, outfile, controlfile):
        """Build the shell statement that runs the SPP R script.

        Arguments
        ---------
        infile : string
            Passed to the script as ``-c=``.
        outfile : string
            File that receives the script's stdout and stderr.
        controlfile : string
            Passed to the script as ``-i=``.

        Returns
        -------
        string
            The complete ``Rscript ...`` command line.

        Raises
        ------
        IOError
            If ``spp_options_idr_script`` names a custom script that does
            not exist.
        Exception
            If both or neither of ``spp_options_npeaks`` and
            ``spp_options_fdr`` are set.
        """
        # select location of the spp script to run
        script_option = self.PARAMS_PEAKCALLER["spp_options_idr_script"]
        if script_option == "default":
            executable = IOTools.which("run_spp.R")
        elif script_option == "nodups":
            executable = IOTools.which("run_spp_nodups.R")
        else:
            executable = script_option
            # os.path.exists() returns False for a missing file instead of
            # raising, so its result has to be tested explicitly
            if not os.path.exists(executable):
                raise IOError("SPP script not found: %s" % executable)

        # select the threshold for lax peak calling
        npeaks = self.PARAMS_PEAKCALLER["spp_options_npeaks"]
        fdr = self.PARAMS_PEAKCALLER["spp_options_fdr"]
        if npeaks and fdr:
            raise Exception("Value specified for both SPP options"
                            " -npeaks and -fdr please select one or"
                            " other option, but not both")
        elif npeaks:
            threshold = "-npeaks=" + str(npeaks)
        elif fdr:
            threshold = "-fdr=" + str(fdr)
        else:
            raise Exception("Must specify a value for either"
                            " spp_options_npeaks or spp_options_fdr,"
                            " but not both")

        # build run statement for spp.
        # -savn is output.npeak.file (passed as NULL,
        #                             means filename based on infile)
        # -out is output.result.file
        # -odir defaults to os.path.dirname( infile )
        # -savn is save narrowpeak file
        # -savr is save regionpeak file
        #  (run_spp.R script throws an error if region peak is not output).
        statement = [("Rscript %(executable)s"
                      " -c=%(infile)s"
                      " -i=%(controlfile)s"
                      " %(threshold)s"
                      " -savn"
                      " -savr")]

        # add additional options
        statement.append(self.PARAMS_PEAKCALLER["spp_options_parameters"])

        # specify outfile
        # MM: this was hard-coded to a non-existent directory
        # changed to stats directory
        statement.append(" -rf"
                         " -out=./stats/phantomPeakStatsReps.tab"
                         " >& %(outfile)s")

        # placeholders are resolved from the local variables above
        statement = (" ".join(statement) % locals())

        return statement
def main(argv=sys.argv):
    """Count per-base coverage of blat matches and write it as a track.

    Matches are read from stdin through ``Blat.BlatIterator``. For every
    aligned block the per-base counter of the target contig is
    incremented. Runs of equal, non-zero counts are then written as
    wiggle or bedgraph text and, for ``bigwig``/``bigbed``, converted
    with the corresponding UCSC executable.

    Arguments
    ---------
    argv : list, optional
        Accepted for interface compatibility; ``E.start`` is called
        without it.

    Returns
    -------
    int or None
        A non-zero value if the binary conversion failed, None otherwise.

    Raises
    ------
    ValueError
        If bigwig/bigbed output is requested without a genome file or
        without an output file name.
    OSError
        If the conversion executable is not found in the path.
    """

    parser = E.OptionParser(
        version=
        "%prog version: $Id: psl2wiggle.py 2834 2009-11-24 16:11:23Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-b",
                      "--output-filename-pattern",
                      dest="output_filename",
                      type="string",
                      help="filename for output [default=%default]")

    parser.add_option("-o",
                      "--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("bedgraph", "wiggle", "bigbed", "bigwig"),
                      help="output format [default=%default]")

    parser.set_defaults(genome_file=None,
                        typecode=numpy.int16,
                        output_filename=None,
                        output_format="wiggle",
                        test=None)

    (options, args) = E.start(parser, add_pipe_options=True)

    typecode = options.typecode

    # per-contig count arrays; stays empty when no genome file is given
    counts = {}

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes(with_synonyms=False)
        E.info("allocating memory for %i contigs and %i bytes" %
               (len(contig_sizes),
                sum(contig_sizes.values()) * typecode().itemsize))
        for contig, size in contig_sizes.items():
            E.debug("allocating %s: %i bases" % (contig, size))
            counts[contig] = numpy.zeros(size, typecode)

        E.info("allocated memory for %i contigs" % len(fasta))

    else:
        fasta = None
        contig_sizes = {}

    to_binary = options.output_format in ("bigwig", "bigbed")

    if to_binary:

        if not options.genome_file:
            raise ValueError(
                "please supply genome file for bigwig/bigbed computation.")

        if not options.output_filename:
            raise ValueError(
                "please output file for bigwig/bigbed computation.")

        if options.output_format == "bigwig":
            executable_name = "wigToBigWig"
        else:
            executable_name = "bedToBigBed"

        executable = IOTools.which(executable_name)

        if not executable:
            raise OSError("could not find %s in path." % executable_name)

        tmpdir = tempfile.mkdtemp()
        E.debug("temporary files are in %s" % tmpdir)

        tmpfile_wig = os.path.join(tmpdir, "wig")
        tmpfile_sizes = os.path.join(tmpdir, "sizes")

        # write contig sizes
        outfile_size = IOTools.open_file(tmpfile_sizes, "w")
        for contig, size in contig_sizes.items():
            outfile_size.write("%s\t%s\n" % (contig, size))
        outfile_size.close()

        outfile = IOTools.open_file(tmpfile_wig, "w")

    else:
        outfile = options.stdout

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, ncontigs, nskipped = 0, 0, 0

    E.info("started counting")

    while True:

        if options.test and ninput >= options.test:
            break

        # the default covers both an iterator that returns None at the
        # end of input and one that raises StopIteration
        match = next(iterator, None)

        if match is None:
            break

        ninput += 1

        contig = match.mSbjctId

        # no counter exists for contigs absent from the genome file
        if contig not in counts:
            nskipped += 1
            continue

        for start, length in zip(match.mSbjctBlockStarts, match.mBlockSizes):
            counts[contig][start:start + length] += 1

    E.info("finished counting")

    if nskipped:
        E.warn("%i matches skipped: contig not in genome file" % nskipped)

    if options.output_format in ("wiggle", "bigwig"):
        E.info("starting wig output")

        # NOTE(review): `start` is the 0-based array index, whereas
        # wiggle variableStep positions are 1-based per the UCSC format
        # description -- confirm whether `start + 1` is needed here.
        for contig, vals in counts.items():

            E.debug("output for %s" % contig)
            # each group is a run of consecutive positions sharing a count
            for val, group in itertools.groupby(enumerate(vals),
                                                lambda x: x[1]):
                run = list(group)
                start, end = run[0][0], run[-1][0]
                if val > 0:
                    outfile.write("variableStep chrom=%s span=%i\n" %
                                  (contig, end - start + 1))
                    outfile.write("%i\t%i\n" % (start, val))

            ncontigs += 1
    elif options.output_format in ("bedgraph", "bigbed"):

        E.info("starting bedgraph output")

        for contig, vals in counts.items():
            E.debug("output for %s" % contig)
            for val, group in itertools.groupby(enumerate(vals),
                                                lambda x: x[1]):
                run = list(group)
                start, end = run[0][0], run[-1][0]
                if val > 0:
                    outfile.write("%s\t%i\t%i\t%i\n" %
                                  (contig, start, end + 1, val))

            ncontigs += 1

    E.info("finished output")

    if to_binary:
        outfile.close()

        E.info("starting bigwig conversion")
        try:
            try:
                retcode = subprocess.call(" ".join(
                    (executable, tmpfile_wig, tmpfile_sizes,
                     os.path.abspath(options.output_filename)), ),
                                          shell=True)
            except OSError as msg:
                E.warn("Error while executing bigwig: %s" % msg)
                return 1
            if retcode < 0:
                E.warn("wigToBigWig terminated with signal: %i" % -retcode)
                return -retcode
            if retcode > 0:
                E.warn("%s failed with exit code: %i" %
                       (executable_name, retcode))
                return retcode
        finally:
            # remove temporary files on the failure paths as well
            shutil.rmtree(tmpdir)

        E.info("finished bigwig conversion")

    E.info("ninput=%i, ncontigs=%i, nskipped=%i\n" %
           (ninput, ncontigs, nskipped))

    E.stop()
 def isInstalled(self):
     """Return True if ``IOTools.which`` finds the tool's executable.

     The executable name is read from ``self.tool_definition``.
     """
     return IOTools.which(self.tool_definition['executable']) is not None
示例#7
0
def run_report(clean=True,
               with_pipeline_status=True,
               pipeline_status_format="svg"):
    '''build the pipeline report.

    Unless *with_pipeline_status* is False, ruffus is first asked to
    draw the pipeline status as an image in *pipeline_status_format*
    into the export directory. If *clean* is set, the directories
    ``report``, ``_cache`` and ``_static`` are removed before building.

    '''

    params = P.get_params()

    if with_pipeline_status:
        targetdir = params["exportdir"]
        if not os.path.exists(targetdir):
            os.mkdir(targetdir)

        status_image = os.path.join(
            targetdir, "pipeline.%s" % pipeline_status_format)
        ruffus.pipeline_printout_graph(
            status_image,
            pipeline_status_format,
            ["full"],
            checksum_level=params["ruffus_checksums_level"])

    # documentation is looked up next to the calling pipeline script
    dirname, basename = os.path.split(P.get_caller().__file__)

    report_engine = params.get("report_engine", "cgatreport")
    assert report_engine in ('sphinxreport', 'cgatreport')

    # NOTE(review): the %(...)s placeholders of the statement below are
    # plain local variables here; presumably P.run resolves them from
    # this frame, so the local names must not be changed.
    docs_root = os.path.join(dirname, "pipeline_docs")
    docdir = os.path.join(docs_root, IOTools.snip(basename, ".py"))
    themedir = os.path.join(docs_root, "themes")
    relpath = os.path.relpath(docdir)
    trackerdir = os.path.join(docdir, "trackers")

    # a fake X display keeps R plots from opening windows; the -d
    # option permits multiple servers
    xvfb_path = IOTools.which("xvfb-run")
    xvfb_command = xvfb_path + " -d " if xvfb_path else ""

    # prefer a conf.py in the working directory over the packaged one
    conf_dir = (os.path.abspath(".")
                if os.path.exists("conf.py")
                else os.path.join(os.path.dirname(__file__), "configuration"))

    # xvfb exits with an error when there is no DISPLAY and, in the
    # current version, in all other cases too: always ignore the
    # return value.
    erase_return = "|| true"

    clean = "rm -rf report _cache _static;" if clean else ""

    # with sphinx >1.3.1 the PYTHONPATH needs to be set explicitely as
    # the virtual environment seems to be stripped. It is thus set to
    # the contents of the current sys.path
    syspath = ":".join(sys.path)

    statement = '''
    %(clean)s
    (export SPHINX_DOCSDIR=%(docdir)s;
    export SPHINX_THEMEDIR=%(themedir)s;
    export PYTHONPATH=%(syspath)s;
    %(xvfb_command)s
    %(report_engine)s-build
    --num-jobs=%(report_threads)s
    sphinx-build
    -b html
    -d %(report_doctrees)s
    -c %(conf_dir)s
    -j %(report_threads)s
    %(docdir)s %(report_html)s
    >& report.log %(erase_return)s )
    '''

    P.run(statement)

    report_index = os.path.abspath(
        os.path.join(params['report_html'], "contents.html"))
    E.info('the report is available at %s' % report_index)