def runControlCPC(infile, outfile):
    # farm.py is called from within cpc.sh
    assert IOTools.which("farm.py"), \
        "farm.py needs to be in $PATH for cpc to run"

    # Default cpc parameters don't work with later versions of blast
    E.info("Running cpc with blast version: %s" % IOTools.which("blastx"))

    result_evidence = P.snip(outfile, ".result") + ".evidence"
    working_dir = "lncRNA_control/cpc"
    statement = ("%(pipeline_scriptsdir)s/cpc.sh"
                 " %(infile)s"
                 " %(outfile)s"
                 " %(working_dir)s"
                 " %(result_evidence)s")
    P.run()
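
# For illustration only: with hypothetical in/out file names, the statement
# above expands to a shell command along these lines (pipeline_scriptsdir is
# resolved from the pipeline configuration when P.run() interpolates it):
#
#   /path/to/scripts/cpc.sh transcripts.fasta transcripts.result \
#       lncRNA_control/cpc transcripts.evidence
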
def check_executables(filenames):
    """check for the presence/absence of executables"""
    missing = []
    for filename in filenames:
        if not IOTools.which(filename):
            missing.append(filename)

    if missing:
        raise ValueError("missing executables: %s" % ",".join(missing))
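
# A minimal usage sketch for check_executables; the tool names below are
# hypothetical examples, not requirements of this module. Called at pipeline
# start-up, it fails fast when a required binary is missing from $PATH:
#
#     check_executables(["bedtools", "wigToBigWig", "samtools"])
#     # -> ValueError("missing executables: wigToBigWig") if one is absent
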
def main(argv=None):
    """script main.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-o", "--output-format", dest="output_format",
                      type="choice",
                      choices=("bedgraph", "wiggle", "bigbed",
                               "bigwig", "bed"),
                      help="output format [default=%default]")

    parser.add_option("-s", "--shift-size", dest="shift", type="int",
                      help="shift reads by a certain amount (ChIP-Seq) "
                      "[%default]")

    parser.add_option("-e", "--extend", dest="extend", type="int",
                      help="extend reads by a certain amount "
                      "(ChIP-Seq) [%default]")

    parser.add_option("-p", "--wiggle-span", dest="span", type="int",
                      help="span of a window in wiggle tracks "
                      "[%default]")

    parser.add_option("-m", "--merge-pairs", dest="merge_pairs",
                      action="store_true",
                      help="merge paired-ended reads into a single "
                      "bed interval [default=%default].")

    parser.add_option("--scale-base", dest="scale_base", type="float",
                      help="number of reads/pairs to scale bigwig file to. "
                      "The default is to scale to 1M reads "
                      "[default=%default]")

    parser.add_option("--scale-method", dest="scale_method", type="choice",
                      choices=("none", "reads"),
                      help="scale bigwig output. 'reads' will normalize by "
                      "the total number of reads in the bam file that are "
                      "used to construct the bigwig file. If --merge-pairs "
                      "is used, the number of pairs output will be used for "
                      "normalization. 'none' will not scale the bigwig file "
                      "[default=%default]")

    parser.add_option("--max-insert-size", dest="max_insert_size",
                      type="int",
                      help="only merge if the insert size is less than "
                      "# bases. 0 turns off this filter "
                      "[default=%default].")

    parser.add_option("--min-insert-size", dest="min_insert_size",
                      type="int",
                      help="only merge paired-end reads if they are "
                      "at least # bases apart. "
                      "0 turns off this filter. [default=%default]")

    parser.set_defaults(
        samfile=None,
        output_format="wiggle",
        shift=0,
        extend=0,
        span=1,
        merge_pairs=None,
        min_insert_size=0,
        max_insert_size=0,
        scale_method='none',
        scale_base=1000000,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if len(args) >= 1:
        options.samfile = args[0]
    if len(args) == 2:
        options.output_filename_pattern = args[1]
    if not options.samfile:
        raise ValueError("please provide a bam file")

    # Read BAM file using pysam
    samfile = pysam.AlignmentFile(options.samfile, "rb")

    # Create temporary files / folders
    tmpdir = tempfile.mkdtemp()
    E.debug("temporary files are in %s" % tmpdir)
    tmpfile_wig = os.path.join(tmpdir, "wig")
    tmpfile_sizes = os.path.join(tmpdir, "sizes")

    # Create dictionary of contig sizes
    contig_sizes = dict(list(zip(samfile.references, samfile.lengths)))
    # write contig sizes
    outfile_size = IOTools.open_file(tmpfile_sizes, "w")
    for contig, size in sorted(contig_sizes.items()):
        outfile_size.write("%s\t%s\n" % (contig, size))
    outfile_size.close()

    # Shift and extend are only available for bigwig format
    if options.shift or options.extend:
        if options.output_format != "bigwig":
            raise ValueError(
                "shift and extend only available for bigwig output")

    # Output filename required for bigwig / bigbed computation
    if options.output_format == "bigwig":
        if not options.output_filename_pattern:
            raise ValueError(
                "please specify an output file for bigwig computation.")

        # Define executable to use for binary conversion
        if options.output_format == "bigwig":
            executable_name = "wigToBigWig"
        else:
            raise ValueError("unknown output format `%s`" %
                             options.output_format)

        # check that the required executable is in the path
        executable = IOTools.which(executable_name)
        if not executable:
            raise OSError("could not find %s in path." % executable_name)

        # Open output file
        outfile = IOTools.open_file(tmpfile_wig, "w")
        E.info("starting output to %s" % tmpfile_wig)
    else:
        outfile = IOTools.open_file(tmpfile_wig, "w")
        E.info("starting output to stdout")

    # Set up output write functions
    if options.output_format in ("wiggle", "bigwig"):
        # wiggle is one-based, so add 1; also, step-size is 1, so all
        # bases need to be output
        if options.span == 1:
            outf = lambda outfile, contig, start, end, val: \
                outfile.write(
                    "".join(["%i\t%i\n" % (x, val)
                             for x in range(start + 1, end + 1)]))
        else:
            outf = SpanWriter(options.span)
    elif options.output_format == "bedgraph":
        # bed is 0-based, open-closed
        outf = lambda outfile, contig, start, end, val: \
            outfile.write("%s\t%i\t%i\t%i\n" % (contig, start, end, val))

    # initialise counters
    ninput, nskipped, ncontigs = 0, 0, 0

    # set output file name
    output_filename_pattern = options.output_filename_pattern
    if output_filename_pattern:
        output_filename = os.path.abspath(output_filename_pattern)

    # shift and extend or merge pairs. Output temporary bed file
    if options.shift > 0 or options.extend > 0 or options.merge_pairs:
        # Workflow 1: convert to bed intervals and use bedtools
        # genomecov to build a coverage file.
        # Convert to bigwig with the UCSC tool bedGraphToBigWig
        if options.merge_pairs:
            # merge pairs using bam2bed
            E.info("merging pairs to temporary file")
            counter = _bam2bed.merge_pairs(
                samfile,
                outfile,
                min_insert_size=options.min_insert_size,
                max_insert_size=options.max_insert_size,
                bed_format=3)
            E.info("merging results: {}".format(counter))
            if counter.output == 0:
                raise ValueError("no pairs output after merging")
        else:
            # create bed file with shifted/extended tags
            shift, extend = options.shift, options.extend
            shift_extend = shift + extend
            counter = E.Counter()

            for contig in samfile.references:
                E.debug("output for %s" % contig)
                lcontig = contig_sizes[contig]

                for read in samfile.fetch(contig):
                    pos = read.pos
                    if read.is_reverse:
                        start = max(0, read.pos + read.alen - shift_extend)
                    else:
                        start = max(0, read.pos + shift)

                    # intervals extending beyond contig are removed
                    if start >= lcontig:
                        continue

                    end = min(lcontig, start + extend)
                    outfile.write("%s\t%i\t%i\n" % (contig, start, end))
                    counter.output += 1

        outfile.close()

        if options.scale_method == "reads":
            scale_factor = float(options.scale_base) / counter.output
            E.info("scaling: method=%s scale_quantity=%i scale_factor=%f" %
                   (options.scale_method, counter.output, scale_factor))
            scale = "-scale %f" % scale_factor
        else:
            scale = ""

        # Convert bed file to coverage file (bedgraph)
        tmpfile_bed = os.path.join(tmpdir, "bed")
        E.info("computing coverage")
        # calculate coverage - format is bedgraph
        statement = """bedtools genomecov -bg -i %(tmpfile_wig)s %(scale)s
        -g %(tmpfile_sizes)s > %(tmpfile_bed)s""" % locals()
        E.run(statement)

        # Convert bedgraph to bigwig
        E.info("converting to bigwig")
        tmpfile_sorted = os.path.join(tmpdir, "sorted")
        statement = ("sort -k1,1 -k2,2n %(tmpfile_bed)s "
                     "> %(tmpfile_sorted)s; "
                     "bedGraphToBigWig %(tmpfile_sorted)s %(tmpfile_sizes)s "
                     "%(output_filename_pattern)s" % locals())
        E.run(statement)

    else:
        # Workflow 2: use the pysam column iterator to build a wig
        # file, then convert to a bigwig or bedgraph file with UCSC
        # tools.
        def column_iter(iterator):
            start = None
            end = 0
            n = None
            for t in iterator:
                if t.pos - end > 1 or n != t.n:
                    if start is not None:
                        yield start, end, n
                    start = t.pos
                    end = t.pos
                    n = t.n
                end = t.pos
            yield start, end, n

        if options.scale_method != "none":
            raise NotImplementedError(
                "scaling not implemented for pileup method")

        # Bedgraph track definition
        if options.output_format == "bedgraph":
            outfile.write("track type=bedGraph\n")

        for contig in samfile.references:
            # if contig != "chrX": continue
            E.debug("output for %s" % contig)
            lcontig = contig_sizes[contig]

            # Write wiggle header
            if options.output_format in ("wiggle", "bigwig"):
                outfile.write("variableStep chrom=%s span=%i\n" %
                              (contig, options.span))

            # Generate pileup per contig using pysam and iterate over
            # columns
            for start, end, val in column_iter(samfile.pileup(contig)):
                # patch: there was a problem with bam files and reads
                # overextending at the end. These are usually Ns, but
                # need to check as otherwise wigToBigWig fails.
                if lcontig <= end:
                    E.warn("read extending beyond contig: %s: %i > %i" %
                           (contig, end, lcontig))
                    end = lcontig
                    if start >= end:
                        continue

                if val > 0:
                    outf(outfile, contig, start, end, val)

            ncontigs += 1

        # Close output file
        if isinstance(outf, SpanWriter):
            outf.flush(outfile)
        else:
            outfile.flush()

        E.info("finished output")

        # Report counters
        E.info("ninput=%i, ncontigs=%i, nskipped=%i" %
               (ninput, ncontigs, nskipped))

        # Convert to binary formats
        if options.output_format == "bigwig":
            outfile.close()

            E.info("starting %s conversion" % executable)
            try:
                retcode = subprocess.call(
                    " ".join((executable,
                              tmpfile_wig,
                              tmpfile_sizes,
                              output_filename_pattern)),
                    shell=True)
                if retcode != 0:
                    E.warn("%s terminated with signal: %i" %
                           (executable, -retcode))
                    return -retcode
            except OSError as msg:
                E.warn("Error while executing bigwig: %s" % msg)
                return 1

            E.info("finished bigwig conversion")
        else:
            with open(tmpfile_wig) as inf:
                sys.stdout.write(inf.read())

    # Cleanup temp files
    shutil.rmtree(tmpdir)

    E.stop()
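
# A self-contained sketch of the run-length compression that column_iter
# performs in the pileup workflow above. column_iter is nested inside
# main(), so its logic is restated here; _PileupColumn is a hypothetical
# stand-in for pysam's pileup columns, of which only .pos and .n are used.
from collections import namedtuple

_PileupColumn = namedtuple("_PileupColumn", ["pos", "n"])


def _demo_column_iter(columns):
    """collapse per-base pileup columns into (start, end, depth) runs."""
    start, end, n = None, 0, None
    for t in columns:
        # a gap in position or a change in depth closes the current run
        if t.pos - end > 1 or n != t.n:
            if start is not None:
                yield start, end, n
            start = t.pos
            n = t.n
        end = t.pos
    yield start, end, n


# Example: depth 2 at positions 0-2 and depth 1 at positions 5-6 collapse
# into two runs, (0, 2, 2) and (5, 6, 1):
#
#     list(_demo_column_iter(
#         [_PileupColumn(p, d)
#          for p, d in [(0, 2), (1, 2), (2, 2), (5, 1), (6, 1)]]))
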
def getRunStatement(self, infile, outfile, controlfile):
    """
    Generate a specific run statement for each peakcaller class
    """
    # select location of the spp script to run
    if self.PARAMS_PEAKCALLER["spp_options_idr_script"] == "default":
        executable = IOTools.which("run_spp.R")
    elif self.PARAMS_PEAKCALLER["spp_options_idr_script"] == "nodups":
        executable = IOTools.which("run_spp_nodups.R")
    else:
        executable = self.PARAMS_PEAKCALLER["spp_options_idr_script"]

    if not executable or not os.path.exists(executable):
        raise IOError("SPP script not found: %s" % executable)

    # select the threshold for lax peak calling
    if self.PARAMS_PEAKCALLER["spp_options_npeaks"]:
        if self.PARAMS_PEAKCALLER["spp_options_fdr"]:
            raise Exception("Value specified for both SPP options"
                            " -npeaks and -fdr; please select one or"
                            " the other option, but not both")
        else:
            threshold = "-npeaks=" + \
                str(self.PARAMS_PEAKCALLER["spp_options_npeaks"])
    elif self.PARAMS_PEAKCALLER["spp_options_fdr"]:
        threshold = "-fdr=" + \
            str(self.PARAMS_PEAKCALLER["spp_options_fdr"])
    else:
        raise Exception("Must specify a value for either"
                        " spp_options_npeaks or spp_options_fdr")

    # build run statement for spp.
    # -savn is output.npeak.file (passed as NULL,
    #  means filename based on infile)
    # -out is output.result.file
    # -odir defaults to os.path.dirname(infile)
    # -savn is save narrowpeak file
    # -savr is save regionpeak file
    # (run_spp.R script throws an error if region peak is not output).
    statement = [("Rscript %(executable)s"
                  " -c=%(infile)s"
                  " -i=%(controlfile)s"
                  " %(threshold)s"
                  " -savn"
                  " -savr")]

    # add additional options
    statement.append(self.PARAMS_PEAKCALLER["spp_options_parameters"])

    # specify outfile
    # MM: this was hard-coded to a non-existent directory;
    # changed to stats directory
    statement.append(" -rf"
                     " -out=./stats/phantomPeakStatsReps.tab"
                     " >& %(outfile)s")

    statement = (" ".join(statement) % locals())

    return statement
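
# For illustration, with hypothetical parameter values
# (spp_options_idr_script="default", spp_options_npeaks=300 and empty
# spp_options_parameters), the returned statement looks roughly like:
#
#   Rscript /usr/bin/run_spp.R -c=chip.bam -i=input.bam -npeaks=300 \
#       -savn -savr -rf -out=./stats/phantomPeakStatsReps.tab >& chip.log
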
def main(argv=sys.argv):

    parser = E.OptionParser(
        version="%prog version: $Id: psl2wiggle.py 2834 2009-11-24 16:11:23Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-b", "--output-filename-pattern",
                      dest="output_filename", type="string",
                      help="filename for output [default=%default]")

    parser.add_option("-o", "--output-format", dest="output_format",
                      type="choice",
                      choices=("bedgraph", "wiggle", "bigbed", "bigwig"),
                      help="output format [default=%default]")

    parser.set_defaults(genome_file=None,
                        typecode=numpy.int16,
                        output_filename=None,
                        output_format="wiggle",
                        test=None)

    (options, args) = E.start(parser, add_pipe_options=True)

    typecode = options.typecode

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        counts = {}
        contig_sizes = fasta.getContigSizes(with_synonyms=False)
        E.info("allocating memory for %i contigs and %i bytes" %
               (len(contig_sizes),
                sum(contig_sizes.values()) * typecode().itemsize))
        for contig, size in list(contig_sizes.items()):
            E.debug("allocating %s: %i bases" % (contig, size))
            counts[contig] = numpy.zeros(size, typecode)

        E.info("allocated memory for %i contigs" % len(fasta))
    else:
        fasta = None
        contig_sizes = {}

    if options.output_format in ("bigwig", "bigbed"):

        if not options.genome_file:
            raise ValueError(
                "please supply genome file for bigwig/bigbed computation.")

        if not options.output_filename:
            raise ValueError(
                "please specify an output file for bigwig/bigbed "
                "computation.")

        if options.output_format == "bigwig":
            executable_name = "wigToBigWig"
        elif options.output_format == "bigbed":
            executable_name = "bedToBigBed"
        else:
            raise ValueError("unknown output format `%s`" %
                             options.output_format)

        executable = IOTools.which(executable_name)

        if not executable:
            raise OSError("could not find %s in path." % executable_name)
        tmpdir = tempfile.mkdtemp()
        E.debug("temporary files are in %s" % tmpdir)

        tmpfile_wig = os.path.join(tmpdir, "wig")
        tmpfile_sizes = os.path.join(tmpdir, "sizes")

        # write contig sizes
        outfile_size = IOTools.open_file(tmpfile_sizes, "w")
        for contig, size in list(contig_sizes.items()):
            outfile_size.write("%s\t%s\n" % (contig, size))
        outfile_size.close()

        outfile = IOTools.open_file(tmpfile_wig, "w")
    else:
        outfile = options.stdout

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, ncontigs, nskipped = 0, 0, 0

    E.info("started counting")

    while 1:

        if options.test and ninput >= options.test:
            break

        match = next(iterator)

        if match is None:
            break

        ninput += 1

        contig = match.mSbjctId

        for start, length in zip(match.mSbjctBlockStarts,
                                 match.mBlockSizes):
            counts[contig][start:start + length] += 1

    E.info("finished counting")

    if options.output_format in ("wiggle", "bigwig"):
        E.info("starting wig output")

        for contig, vals in list(counts.items()):
            E.debug("output for %s" % contig)
            for val, iter in itertools.groupby(enumerate(vals),
                                               lambda x: x[1]):
                l = list(iter)
                start, end = l[0][0], l[-1][0]
                val = vals[start]
                if val > 0:
                    outfile.write("variableStep chrom=%s span=%i\n" %
                                  (contig, end - start + 1))
                    outfile.write("%i\t%i\n" % (start, val))

            ncontigs += 1

    elif options.output_format in ("bedgraph", "bigbed"):

        E.info("starting bedgraph output")

        for contig, vals in list(counts.items()):
            E.debug("output for %s" % contig)
            for val, iter in itertools.groupby(enumerate(vals),
                                               lambda x: x[1]):
                l = list(iter)
                start, end = l[0][0], l[-1][0]
                val = vals[start]
                if val > 0:
                    outfile.write("%s\t%i\t%i\t%i\n" %
                                  (contig, start, end + 1, val))

            ncontigs += 1

    E.info("finished output")

    if options.output_format in ("bigwig", "bigbed"):
        outfile.close()

        E.info("starting bigwig conversion")
        try:
            retcode = subprocess.call(
                " ".join((executable,
                          tmpfile_wig,
                          tmpfile_sizes,
                          os.path.abspath(options.output_filename))),
                shell=True)
            if retcode < 0:
                E.warn("%s terminated with signal: %i" %
                       (executable_name, -retcode))
                return -retcode
        except OSError as msg:
            E.warn("Error while executing bigwig: %s" % msg)
            return 1

        shutil.rmtree(tmpdir)

        E.info("finished bigwig conversion")

    E.info("ninput=%i, ncontigs=%i, nskipped=%i\n" %
           (ninput, ncontigs, nskipped))

    E.stop()
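
# A hypothetical invocation sketch: PSL alignments are read from stdin, so a
# bigwig track could be built along these lines (file names are examples):
#
#   cat alignments.psl | python psl2wiggle.py --genome-file=hg38 \
#       --output-format=bigwig --output-filename-pattern=coverage.bw
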
def isInstalled(self):
    """return True if the tool's executable can be found on $PATH."""
    path = IOTools.which(self.tool_definition['executable'])
    return path is not None
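
# A minimal sketch of how isInstalled might be called; the class holding
# tool_definition is assumed from context and "samtools" is just an example:
#
#     tool.tool_definition = {"executable": "samtools"}
#     if not tool.isInstalled():
#         raise RuntimeError("samtools is required but is not on $PATH")
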
def run_report(clean=True,
               with_pipeline_status=True,
               pipeline_status_format="svg"):
    '''run CGATreport.

    This will also run ruffus to create an svg image of the pipeline
    status unless *with_pipeline_status* is set to False. The image
    will be saved into the export directory.
    '''

    params = P.get_params()

    if with_pipeline_status:
        targetdir = params["exportdir"]
        if not os.path.exists(targetdir):
            os.mkdir(targetdir)

        ruffus.pipeline_printout_graph(
            os.path.join(
                targetdir,
                "pipeline.%s" % pipeline_status_format),
            pipeline_status_format,
            ["full"],
            checksum_level=params["ruffus_checksums_level"]
        )

    dirname, basename = os.path.split(P.get_caller().__file__)

    report_engine = params.get("report_engine", "cgatreport")
    assert report_engine in ('sphinxreport', 'cgatreport')

    docdir = os.path.join(dirname, "pipeline_docs",
                          IOTools.snip(basename, ".py"))
    themedir = os.path.join(dirname, "pipeline_docs", "themes")
    relpath = os.path.relpath(docdir)
    trackerdir = os.path.join(docdir, "trackers")

    # use a fake X display in order to avoid windows popping up
    # from R plots.
    xvfb_command = IOTools.which("xvfb-run")

    # permit multiple servers using the -d option
    if xvfb_command:
        xvfb_command += " -d "
    else:
        xvfb_command = ""

    # if there is no DISPLAY variable set, xvfb runs, but
    # exits with error when killing process. Thus, ignore return
    # value.
    # print os.getenv("DISPLAY"), "command=", xvfb_command
    if not os.getenv("DISPLAY"):
        erase_return = "|| true"
    else:
        erase_return = ""

    if os.path.exists("conf.py"):
        conf_dir = os.path.abspath(".")
    else:
        conf_dir = os.path.join(os.path.dirname(__file__), "configuration")

    # in the current version, xvfb always returns with an error, thus
    # ignore these.
    erase_return = "|| true"

    if clean:
        clean = "rm -rf report _cache _static;"
    else:
        clean = ""

    # with sphinx >1.3.1 the PYTHONPATH needs to be set explicitly as
    # the virtual environment seems to be stripped. It is thus set to
    # the contents of the current sys.path
    syspath = ":".join(sys.path)

    statement = '''
    %(clean)s
    (export SPHINX_DOCSDIR=%(docdir)s;
    export SPHINX_THEMEDIR=%(themedir)s;
    export PYTHONPATH=%(syspath)s;
    %(xvfb_command)s
    %(report_engine)s-build
    --num-jobs=%(report_threads)s
    sphinx-build
    -b html
    -d %(report_doctrees)s
    -c %(conf_dir)s
    -j %(report_threads)s
    %(docdir)s %(report_html)s
    >& report.log %(erase_return)s )
    '''

    P.run(statement)

    E.info('the report is available at %s' % os.path.abspath(
        os.path.join(params['report_html'], "contents.html")))
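
# A sketch of how run_report is typically exposed as a pipeline target; the
# task name and ruffus wiring below are assumptions based on common CGAT
# pipeline layout, not taken from this module:
#
#     @ruffus.follows("full")
#     def build_report():
#         '''build the pipeline report.'''
#         run_report(clean=True)
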