def _run_on_chrom(chrom, work_bams, names, work_dir, items):
    """Run cn.mops on work BAMs for a specific chromosome.
    """
    local_sitelib = utils.R_sitelib()
    batch = sshared.get_cur_batch(items)
    ext = "-%s-cnv" % batch if batch else "-cnv"
    out_file = os.path.join(work_dir, "%s%s-%s.bed" %
                            (os.path.splitext(os.path.basename(work_bams[0]))[0],
                             ext, chrom if chrom else "all"))
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            rcode = "%s-run.R" % os.path.splitext(out_file)[0]
            with open(rcode, "w") as out_handle:
                out_handle.write(_script.format(prep_str=_prep_load_script(work_bams, names, chrom, items),
                                                out_file=tx_out_file,
                                                local_sitelib=local_sitelib))
            rscript = utils.Rscript_cmd()
            try:
                do.run([rscript, "--vanilla", rcode], "cn.mops CNV detection",
                       items[0], log_error=False)
            except subprocess.CalledProcessError as msg:
                # cn.mops errors out if no CNVs found. Just write an empty file.
                if _allowed_cnmops_errorstates(str(msg)):
                    with open(tx_out_file, "w") as out_handle:
                        out_handle.write('track name=empty description="No CNVs found"\n')
                else:
                    logger.exception()
                    raise
    return [out_file]

def _setup_logging(args):
    # Set environment to standard to use periods for decimals and avoid localization
    os.environ["LC_ALL"] = "C"
    os.environ["LC"] = "C"
    os.environ["LANG"] = "C"
    config = None
    if len(args) == 1 and isinstance(args[0], (list, tuple)):
        args = args[0]
    for arg in args:
        if config_utils.is_nested_config_arg(arg):
            config = arg["config"]
            break
        elif config_utils.is_std_config_arg(arg):
            config = arg
            break
        elif isinstance(arg, (list, tuple)) and config_utils.is_nested_config_arg(arg[0]):
            config = arg[0]["config"]
            break
    if config is None:
        raise NotImplementedError("No config found in arguments: %s" % args[0])
    handler = setup_local_logging(config, config.get("parallel", {}))
    try:
        yield config
    except:
        logger.exception("Unexpected error")
        raise
    finally:
        if hasattr(handler, "close"):
            handler.close()

def _run_bubbletree(vcf_csv, cnv_csv, data, has_normal=True):
    """Create R script and run on input data
    """
    local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                 "lib", "R", "site-library")
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = "%s-run.R" % base
    bubbleplot_out = "%s-bubbleplot.pdf" % base
    trackplot_out = "%s-trackplot.pdf" % base
    calls_out = "%s-calls.rds" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    sample = dd.get_sample_name(data)
    # BubbleTree has some internal hardcoded parameters that assume a smaller
    # distribution of log2 scores. This is not true for tumor-only calls and
    # normal contamination, so we scale the calculations to actually get calls.
    # Need a better long term solution with flexible parameters.
    lrr_scale = 1.0 if has_normal else 10.0
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    if not utils.file_exists(freqs_out):
        try:
            do.run([utils.Rscript_cmd(), r_file], "Assess heterogeneity with BubbleTree")
        except subprocess.CalledProcessError as msg:
            if _allowed_bubbletree_errorstates(str(msg)):
                with open(freqs_out, "w") as out_handle:
                    out_handle.write('bubbletree failed:\n %s\n' % (str(msg)))
            else:
                logger.exception()
                raise

def _get_machine_info(parallel, sys_config, dirs, config):
    """Get machine resource information from the job scheduler via either the command line or the queue.
    """
    if parallel.get("queue") and parallel.get("scheduler"):
        # dictionary as switch statement; can add new scheduler implementation functions as (lowercase) keys
        sched_info_dict = {"slurm": _slurm_info,
                           "torque": _torque_info,
                           "sge": _sge_info}
        if parallel["scheduler"].lower() in sched_info_dict:
            try:
                return sched_info_dict[parallel["scheduler"].lower()](parallel.get("queue", ""))
            except:
                # If something goes wrong, just hit the queue
                logger.exception("Couldn't get machine information from resource query function for queue "
                                 "'{0}' on scheduler \"{1}\"; "
                                 "submitting job to queue".format(parallel.get("queue", ""), parallel["scheduler"]))
        else:
            logger.info("Resource query function not implemented for scheduler \"{0}\"; "
                        "submitting job to queue".format(parallel["scheduler"]))
    from bcbio.distributed import prun
    with prun.start(parallel, [[sys_config]], config, dirs) as run_parallel:
        return run_parallel("machine_info", [[sys_config]])

def run_memory_retry(cmd, descr, data=None, check=None, region=None):
    """Run command, retrying when detecting failures due to memory errors.

    This is useful for high throughput Java jobs which fail intermittently
    due to an inability to get system resources.
    """
    max_runs = 5
    num_runs = 0
    while True:
        try:
            run(cmd, descr, data, check, region=region, log_error=False)
            break
        except subprocess.CalledProcessError as msg:
            if num_runs < max_runs and ("insufficient memory" in str(msg) or
                                        "did not provide enough memory" in str(msg) or
                                        "A fatal error has been detected" in str(msg) or
                                        "java.lang.OutOfMemoryError" in str(msg) or
                                        "Resource temporarily unavailable" in str(msg)):
                logger.info("Retrying job. Memory or resource issue with run: %s"
                            % _descr_str(descr, data, region))
                time.sleep(30)
                num_runs += 1
            else:
                logger.exception()
                raise

def _run_bubbletree(vcf_csv, cnv_csv, data, wide_lrr=False, do_plots=True,
                    handle_failures=True):
    """Create R script and run on input data

    BubbleTree has some internal hardcoded parameters that assume a smaller
    distribution of log2 scores. This is not true for tumor-only calls, so
    if we specify wide_lrr we scale the calculations to actually get calls.
    Need a better long term solution with flexible parameters.
    """
    lrr_scale = 10.0 if wide_lrr else 1.0
    local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                 "lib", "R", "site-library")
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = "%s-run.R" % base
    bubbleplot_out = "%s-bubbleplot.pdf" % base
    trackplot_out = "%s-trackplot.pdf" % base
    calls_out = "%s-calls.rds" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    sample = dd.get_sample_name(data)
    do_plots = "yes" if do_plots else "no"
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    if not utils.file_exists(freqs_out):
        try:
            do.run([utils.Rscript_cmd(), r_file], "Assess heterogeneity with BubbleTree")
        except subprocess.CalledProcessError as msg:
            if handle_failures and _allowed_bubbletree_errorstates(str(msg)):
                with open(freqs_out, "w") as out_handle:
                    out_handle.write('bubbletree failed:\n %s\n' % (str(msg)))
            else:
                logger.exception()
                raise

def make_scrnaseq_object(samples):
    """Load the initial se.rda object using SingleCellExperiment.
    """
    local_sitelib = R_sitelib()
    counts_dir = os.path.dirname(dd.get_in_samples(samples, dd.get_combined_counts))
    gtf_file = dd.get_in_samples(samples, dd.get_transcriptome_gtf)
    if not gtf_file:
        gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    rda_file = os.path.join(counts_dir, "se.rda")
    if not file_exists(rda_file):
        with file_transaction(rda_file) as tx_out_file:
            rcode = "%s-run.R" % os.path.splitext(rda_file)[0]
            rrna_file = "%s-rrna.txt" % os.path.splitext(rda_file)[0]
            rrna_file = _find_rRNA_genes(gtf_file, rrna_file)
            with open(rcode, "w") as out_handle:
                out_handle.write(_script.format(**locals()))
            rscript = Rscript_cmd()
            try:
                # do.run([rscript, "--vanilla", rcode],
                #        "SingleCellExperiment",
                #        log_error=False)
                rda_file = rcode
            except subprocess.CalledProcessError as msg:
                logger.exception()

def _run_bubbletree(vcf_csv, cnv_csv, data, wide_lrr=False, do_plots=True,
                    handle_failures=True):
    """Create R script and run on input data

    BubbleTree has some internal hardcoded parameters that assume a smaller
    distribution of log2 scores. This is not true for tumor-only calls, so
    if we specify wide_lrr we scale the calculations to actually get calls.
    Need a better long term solution with flexible parameters.
    """
    lrr_scale = 10.0 if wide_lrr else 1.0
    local_sitelib = utils.R_sitelib()
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = "%s-run.R" % base
    bubbleplot_out = "%s-bubbleplot.pdf" % base
    trackplot_out = "%s-trackplot.pdf" % base
    calls_out = "%s-calls.rds" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    sample = dd.get_sample_name(data)
    do_plots = "yes" if do_plots else "no"
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    if not utils.file_exists(freqs_out):
        cmd = "%s && %s --no-environ %s" % (utils.get_R_exports(), utils.Rscript_cmd(), r_file)
        try:
            do.run(cmd, "Assess heterogeneity with BubbleTree")
        except subprocess.CalledProcessError as msg:
            if handle_failures and _allowed_bubbletree_errorstates(str(msg)):
                with open(freqs_out, "w") as out_handle:
                    out_handle.write('bubbletree failed:\n %s\n' % (str(msg)))
            else:
                logger.exception()
                raise
    return {"caller": "bubbletree",
            "report": freqs_out,
            "plot": {"bubble": bubbleplot_out, "track": trackplot_out}}

def _run_on_chrom(chrom, work_bams, names, work_dir, items):
    """Run cn.mops on work BAMs for a specific chromosome.
    """
    local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                 "lib", "R", "site-library")
    out_file = os.path.join(work_dir, "%s-%s-cnv.bed" %
                            (os.path.splitext(os.path.basename(work_bams[0]))[0],
                             chrom if chrom else "all"))
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            rcode = "%s-run.R" % os.path.splitext(out_file)[0]
            with open(rcode, "w") as out_handle:
                out_handle.write(_script.format(prep_str=_prep_load_script(work_bams, names, chrom, items),
                                                out_file=tx_out_file,
                                                local_sitelib=local_sitelib))
            rscript = config_utils.get_program("Rscript", items[0]["config"])
            try:
                do.run([rscript, rcode], "cn.mops CNV detection", items[0], log_error=False)
            except subprocess.CalledProcessError as msg:
                # cn.mops errors out if no CNVs found. Just write an empty file.
                if _allowed_cnmops_errorstates(str(msg)):
                    with open(tx_out_file, "w") as out_handle:
                        out_handle.write('track name=empty description="No CNVs found"\n')
                else:
                    logger.exception()
                    raise
    return [out_file]

def _run_bubbletree(vcf_csv, cnv_csv, data):
    """Create R script and run on input data
    """
    local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                 "lib", "R", "site-library")
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = "%s-run.R" % base
    bubbleplot_out = "%s-bubbleplot.pdf" % base
    trackplot_out = "%s-trackplot.pdf" % base
    calls_out = "%s-calls.rds" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    sample = dd.get_sample_name(data)
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    if not utils.file_exists(freqs_out):
        try:
            do.run([utils.Rscript_cmd(), r_file], "Assess heterogeneity with BubbleTree")
        except subprocess.CalledProcessError as msg:
            if _allowed_bubbletree_errorstates(str(msg)):
                with open(freqs_out, "w") as out_handle:
                    out_handle.write('bubbletree failed:\n %s\n' % (str(msg)))
            else:
                logger.exception()
                raise

def run(cmd, descr=None, data=None, checks=None, region=None, log_error=True,
        log_stdout=False, env=None):
    """Run the provided command, logging details and checking for errors.
    """
    if descr:
        descr = _descr_str(descr, data, region)
        logger.debug(descr)
    cmd_id = diagnostics.start_cmd(cmd, descr or "", data)
    try:
        logger_cl.debug(" ".join(str(x) for x in cmd) if not isinstance(cmd, basestring) else cmd)
        _do_run(cmd, checks, log_stdout, env=env)
    except:
        diagnostics.end_cmd(cmd_id, False)
        if log_error:
            logger.exception()
        raise
    finally:
        diagnostics.end_cmd(cmd_id)

def _setup_logging(args):
    config = None
    if len(args) == 1 and isinstance(args[0], (list, tuple)):
        args = args[0]
    for arg in args:
        if config_utils.is_nested_config_arg(arg):
            config = arg["config"]
            break
        elif config_utils.is_std_config_arg(arg):
            config = arg
            break
        elif isinstance(arg, (list, tuple)) and config_utils.is_nested_config_arg(arg[0]):
            config = arg[0]["config"]
            break
    if config is None:
        raise NotImplementedError("No config found in arguments: %s" % args[0])
    handler = setup_local_logging(config, config.get("parallel", {}))
    try:
        yield config
    except:
        logger.exception("Unexpected error")
        raise
    finally:
        if hasattr(handler, "close"):
            handler.close()

def _run_on_chrom(chrom, work_bams, names, work_dir, items):
    """Run cn.mops on work BAMs for a specific chromosome.
    """
    local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                 "lib", "R", "site-library")
    batch = sshared.get_cur_batch(items)
    ext = "-%s-cnv" % batch if batch else "-cnv"
    out_file = os.path.join(work_dir, "%s%s-%s.bed" %
                            (os.path.splitext(os.path.basename(work_bams[0]))[0],
                             ext, chrom if chrom else "all"))
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            rcode = "%s-run.R" % os.path.splitext(out_file)[0]
            with open(rcode, "w") as out_handle:
                out_handle.write(_script.format(prep_str=_prep_load_script(work_bams, names, chrom, items),
                                                out_file=tx_out_file,
                                                local_sitelib=local_sitelib))
            rscript = config_utils.get_program("Rscript", items[0]["config"])
            try:
                do.run([rscript, rcode], "cn.mops CNV detection", items[0], log_error=False)
            except subprocess.CalledProcessError as msg:
                # cn.mops errors out if no CNVs found. Just write an empty file.
                if _allowed_cnmops_errorstates(str(msg)):
                    with open(tx_out_file, "w") as out_handle:
                        out_handle.write('track name=empty description="No CNVs found"\n')
                else:
                    logger.exception()
                    raise
    return [out_file]

def make_scrnaseq_object(samples):
    """Load the initial se.rda object using SingleCellExperiment.
    """
    local_sitelib = R_sitelib()
    counts_dir = os.path.dirname(dd.get_in_samples(samples, dd.get_combined_counts))
    gtf_file = dd.get_in_samples(samples, dd.get_transcriptome_gtf)
    if not gtf_file:
        gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    rda_file = os.path.join(counts_dir, "se.rda")
    if not file_exists(rda_file):
        with file_transaction(rda_file) as tx_out_file:
            rcode = "%s-run.R" % os.path.splitext(rda_file)[0]
            rrna_file = "%s-rrna.txt" % os.path.splitext(rda_file)[0]
            rrna_file = _find_rRNA_genes(gtf_file, rrna_file)
            with open(rcode, "w") as out_handle:
                out_handle.write(_script.format(**locals()))
            rscript = Rscript_cmd()
            try:
                # do.run([rscript, "--no-environ", rcode],
                #        "SingleCellExperiment",
                #        log_error=False)
                rda_file = rcode
            except subprocess.CalledProcessError as msg:
                logger.exception()

def gatk_indel_realignment(runner, align_bam, ref_file, intervals,
                           region=None, out_file=None, deep_coverage=False):
    """Perform realignment of BAM file in specified regions
    """
    if out_file is None:
        out_file = "%s-realign.bam" % os.path.splitext(align_bam)[0]
    if not file_exists(out_file):
        with curdir_tmpdir() as tmp_dir:
            with file_transaction(out_file) as tx_out_file:
                logger.info("GATK IndelRealigner: %s %s" %
                            (os.path.basename(align_bam), region))
                params = ["-T", "IndelRealigner",
                          "-I", align_bam,
                          "-R", ref_file,
                          "-targetIntervals", intervals,
                          "-o", tx_out_file,
                          "-l", "INFO",
                          ]
                if region:
                    params += ["-L", region]
                if deep_coverage:
                    params += ["--maxReadsInMemory", "300000",
                               "--maxReadsForRealignment", str(int(5e5)),
                               "--maxReadsForConsensuses", "500",
                               "--maxConsensuses", "100"]
                try:
                    runner.run_gatk(params, tmp_dir)
                except:
                    logger.exception("Running GATK IndelRealigner failed: {} {}".format(
                        os.path.basename(align_bam), region))
                    raise
    return out_file

def process(args):
    """Run the function in args.name given arguments in args.argfile.
    """
    # Set environment to standard to use periods for decimals and avoid localization
    locale_to_use = utils.get_locale()
    os.environ["LC_ALL"] = locale_to_use
    os.environ["LC"] = locale_to_use
    os.environ["LANG"] = locale_to_use
    setpath.prepend_bcbiopath()
    try:
        fn = getattr(multitasks, args.name)
    except AttributeError:
        raise AttributeError("Did not find exposed function in bcbio.distributed.multitasks named '%s'"
                             % args.name)
    if args.moreargs or args.raw:
        fnargs = [args.argfile] + args.moreargs
        work_dir = None
        argfile = None
    else:
        with open(args.argfile) as in_handle:
            fnargs = yaml.safe_load(in_handle)
        work_dir = os.path.dirname(args.argfile)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = args.outfile if args.outfile else "%s-out%s" % os.path.splitext(args.argfile)
    if not work_dir:
        work_dir = os.getcwd()
    if len(fnargs) > 0 and fnargs[0] == "cwl":
        fnargs, parallel, out_keys, input_files = _world_from_cwl(args.name, fnargs[1:], work_dir)
        # Can remove this awkward Docker merge when we do not need custom GATK3 installs
        fnargs = config_utils.merge_resources(fnargs)
        argfile = os.path.join(work_dir, "cwl.output.json")
    else:
        parallel, out_keys, input_files = None, {}, []
    with utils.chdir(work_dir):
        with contextlib.closing(log.setup_local_logging(parallel={"wrapper": "runfn"})):
            try:
                out = fn(*fnargs)
            except:
                logger.exception()
                raise
            finally:
                # Clean up any copied and unpacked workflow inputs, avoiding extra disk usage
                wf_input_dir = os.path.join(work_dir, "wf-inputs")
                if os.path.exists(wf_input_dir) and os.path.isdir(wf_input_dir):
                    shutil.rmtree(wf_input_dir)
            if argfile:
                try:
                    _write_out_argfile(argfile, out, fnargs, parallel, out_keys,
                                       input_files, work_dir)
                except:
                    logger.exception()
                    raise

def sort(in_bam, config, order="coordinate"):
    """Sort a BAM file, skipping if already present.
    """
    assert is_bam(in_bam), "%s is not a BAM file" % in_bam
    if bam_already_sorted(in_bam, config, order):
        return in_bam
    sort_stem = _get_sort_stem(in_bam, order)
    sort_file = sort_stem + ".bam"
    if not utils.file_exists(sort_file):
        sambamba = _get_sambamba(config)
        samtools = config_utils.get_program("samtools", config)
        cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, sort_file) as tx_sort_file:
            tx_sort_stem = os.path.splitext(tx_sort_file)[0]
            tx_dir = utils.safe_makedir(os.path.dirname(tx_sort_file))
            order_flag = "-n" if order == "queryname" else ""
            resources = config_utils.get_resources("samtools", config)
            mem = resources.get("memory", "2G")
            samtools_cmd = ("{samtools} sort -@ {cores} -m {mem} {order_flag} "
                            "{in_bam} {tx_sort_stem}")
            if sambamba:
                if tz.get_in(["resources", "sambamba"], config):
                    sm_resources = config_utils.get_resources("sambamba", config)
                    mem = sm_resources.get("memory", "2G")
                # sambamba uses total memory, not memory per core
                mem = config_utils.adjust_memory(mem, cores, "increase").upper()
                # Use samtools compatible natural sorting
                # https://github.com/lomereiter/sambamba/issues/132
                order_flag = "--natural-sort" if order == "queryname" else ""
                cmd = ("{sambamba} sort -t {cores} -m {mem} {order_flag} "
                       "-o {tx_sort_file} --tmpdir={tx_dir} {in_bam}")
            else:
                cmd = samtools_cmd
            # sambamba has intermittent multicore failures. Allow
            # retries with single core
            try:
                do.run(cmd.format(**locals()),
                       "Sort BAM file (multi core, %s): %s to %s" %
                       (order, os.path.basename(in_bam), os.path.basename(sort_file)))
            except:
                logger.exception("Multi-core sorting failed, reverting to single core")
                resources = config_utils.get_resources("samtools", config)
                mem = resources.get("memory", "2G")
                cores = 1
                order_flag = "-n" if order == "queryname" else ""
                do.run(samtools_cmd.format(**locals()),
                       "Sort BAM file (single core, %s): %s to %s" %
                       (order, os.path.basename(in_bam), os.path.basename(sort_file)))
    return sort_file

def _run_purecn(paired, work_dir):
    """Run PureCN.R wrapper with pre-segmented CNVkit or GATK4 inputs.
    """
    segfns = {"cnvkit": _segment_normalized_cnvkit,
              "gatk-cnv": _segment_normalized_gatk}
    out_base, out, all_files = _get_purecn_files(paired, work_dir)
    failed_file = out_base + "-failed.log"
    cnr_file = tz.get_in(["depth", "bins", "normalized"], paired.tumor_data)
    if not utils.file_uptodate(out["rds"], cnr_file) and not utils.file_exists(failed_file):
        cnr_file, seg_file = segfns[cnvkit.bin_approach(paired.tumor_data)](cnr_file, work_dir, paired)
        from bcbio import heterogeneity
        vcf_file = heterogeneity.get_variants(paired.tumor_data, include_germline=False)[0]["vrn_file"]
        vcf_file = germline.filter_to_pass_and_reject(vcf_file, paired, out_dir=work_dir)
        with file_transaction(paired.tumor_data, out_base) as tx_out_base:
            # Use UCSC style naming for human builds to support BSgenome
            genome = ("hg19" if dd.get_genome_build(paired.tumor_data) in ["GRCh37", "hg19"]
                      else dd.get_genome_build(paired.tumor_data))
            cmd = ["PureCN.R", "--seed", "42", "--out", tx_out_base,
                   "--rds", "%s.rds" % tx_out_base,
                   "--sampleid", dd.get_sample_name(paired.tumor_data),
                   "--genome", genome,
                   "--vcf", vcf_file, "--tumor", cnr_file,
                   "--segfile", seg_file, "--funsegmentation", "Hclust",
                   "--maxnonclonal", "0.3"]
            if dd.get_num_cores(paired.tumor_data) > 1:
                cmd += ["--cores", str(dd.get_num_cores(paired.tumor_data))]
            try:
                cmd = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(), utils.get_R_exports(),
                                                             " ".join([str(x) for x in cmd]))
                do.run(cmd, "PureCN copy number calling")
            except subprocess.CalledProcessError as msg:
                if _allowed_errors(str(msg)):
                    logger.info("PureCN failed to find solution for %s: skipping" %
                                dd.get_sample_name(paired.tumor_data))
                    with open(failed_file, "w") as out_handle:
                        out_handle.write(str(msg))
                else:
                    logger.exception()
                    raise
            for f in all_files:
                if os.path.exists(os.path.join(os.path.dirname(tx_out_base), f)):
                    shutil.move(os.path.join(os.path.dirname(tx_out_base), f),
                                os.path.join(os.path.dirname(out_base), f))
    out = _get_purecn_files(paired, work_dir, require_exist=True)[1]
    return out if (out.get("rds") and os.path.exists(out["rds"])) else None

def _bgzip_from_bam(bam_file, dirs, config, is_retry=False, output_infix=''):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) * cores
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(work_dir, "%s%s-1.fq.gz" %
                              (os.path.splitext(os.path.basename(bam_file))[0], output_infix))
    out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        if not bam.is_paired(bam_file):
            out_file_2 = None
        with file_transaction(config, out_file_1) as tx_out_file:
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = objectstore.cl_input(bam_file)
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} " + out_str
            try:
                do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info("bamtofastq deflate IO failure preparing %s. Retrying with single core."
                                % (bam_file))
                    needs_retry = True
                else:
                    logger.exception()
                    raise
    if needs_retry:
        return _bgzip_from_bam(bam_file, dirs, config, is_retry=True)
    else:
        return [x for x in [out_file_1, out_file_2]
                if x is not None and utils.file_exists(x)]

def _bgzip_from_bam(bam_file, dirs, data, is_retry=False, output_infix=''):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    config = data["config"]
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) * cores
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(work_dir, "%s%s-1.fq.gz" %
                              (os.path.splitext(os.path.basename(bam_file))[0], output_infix))
    out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        if not bam.is_paired(bam_file):
            out_file_2 = None
        with file_transaction(config, out_file_1) as tx_out_file:
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            prep_cmd = _seqtk_fastq_prep_cl(data, read_num=0)
            if prep_cmd:
                fq1_bgzip_cmd = prep_cmd + " | " + fq1_bgzip_cmd
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                prep_cmd = _seqtk_fastq_prep_cl(data, read_num=1)
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                if prep_cmd:
                    fq2_bgzip_cmd = prep_cmd + " | " + fq2_bgzip_cmd
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = objectstore.cl_input(bam_file)
            extra_opts = " ".join([str(x) for x in resources.get("options", [])])
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} {extra_opts} " + out_str
            try:
                do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info("bamtofastq deflate IO failure preparing %s. Retrying with single core."
                                % (bam_file))
                    needs_retry = True
                else:
                    logger.exception()
                    raise
    if needs_retry:
        return _bgzip_from_bam(bam_file, dirs, data, is_retry=True)
    else:
        return [x for x in [out_file_1, out_file_2]
                if x is not None and utils.file_exists(x)]

def process(args):
    """Run the function in args.name given arguments in args.argfile.
    """
    # Set environment to standard to use periods for decimals and avoid localization
    os.environ["LC_ALL"] = "C"
    os.environ["LC"] = "C"
    os.environ["LANG"] = "C"
    try:
        fn = getattr(multitasks, args.name)
    except AttributeError:
        raise AttributeError("Did not find exposed function in bcbio.distributed.multitasks named '%s'"
                             % args.name)
    if args.moreargs or args.raw:
        fnargs = [args.argfile] + args.moreargs
        work_dir = None
        argfile = None
    else:
        with open(args.argfile) as in_handle:
            fnargs = yaml.safe_load(in_handle)
        work_dir = os.path.dirname(args.argfile)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = args.outfile if args.outfile else "%s-out%s" % os.path.splitext(args.argfile)
    if not work_dir:
        work_dir = os.getcwd()
    if len(fnargs) > 0 and fnargs[0] == "cwl":
        fnargs, parallel, out_keys = _world_from_cwl(args.name, fnargs[1:], work_dir)
        # Can remove this awkward Docker merge when we do not need custom GATK3 installs
        fnargs = config_utils.merge_resources(fnargs)
        argfile = os.path.join(work_dir, "cwl.output.json")
    else:
        parallel, out_keys = None, {}
    with utils.chdir(work_dir):
        with contextlib.closing(log.setup_local_logging(parallel={"wrapper": "runfn"})):
            try:
                out = fn(fnargs)
            except:
                logger.exception()
                raise
            if argfile:
                try:
                    _write_out_argfile(argfile, out, fnargs, parallel, out_keys, work_dir)
                except:
                    logger.exception()
                    raise
                if argfile.endswith(".json"):
                    _write_wdl_outputs(argfile, out_keys)

def process(args):
    """Run the function in args.name given arguments in args.argfile.
    """
    # Set environment to standard to use periods for decimals and avoid localization
    os.environ["LC_ALL"] = "C"
    os.environ["LC"] = "C"
    os.environ["LANG"] = "C"
    setpath.prepend_bcbiopath()
    try:
        fn = getattr(multitasks, args.name)
    except AttributeError:
        raise AttributeError("Did not find exposed function in bcbio.distributed.multitasks named '%s'"
                             % args.name)
    if args.moreargs or args.raw:
        fnargs = [args.argfile] + args.moreargs
        work_dir = None
        argfile = None
    else:
        with open(args.argfile) as in_handle:
            fnargs = yaml.safe_load(in_handle)
        work_dir = os.path.dirname(args.argfile)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = args.outfile if args.outfile else "%s-out%s" % os.path.splitext(args.argfile)
    if not work_dir:
        work_dir = os.getcwd()
    if len(fnargs) > 0 and fnargs[0] == "cwl":
        fnargs, parallel, out_keys, input_files = _world_from_cwl(args.name, fnargs[1:], work_dir)
        # Can remove this awkward Docker merge when we do not need custom GATK3 installs
        fnargs = config_utils.merge_resources(fnargs)
        argfile = os.path.join(work_dir, "cwl.output.json")
    else:
        parallel, out_keys, input_files = None, {}, []
    with utils.chdir(work_dir):
        with contextlib.closing(log.setup_local_logging(parallel={"wrapper": "runfn"})):
            try:
                out = fn(*fnargs)
            except:
                logger.exception()
                raise
            finally:
                # Clean up any copied and unpacked workflow inputs, avoiding extra disk usage
                wf_input_dir = os.path.join(work_dir, "wf-inputs")
                if os.path.exists(wf_input_dir) and os.path.isdir(wf_input_dir):
                    shutil.rmtree(wf_input_dir)
            if argfile:
                try:
                    _write_out_argfile(argfile, out, fnargs, parallel, out_keys,
                                       input_files, work_dir)
                except:
                    logger.exception()
                    raise

def run(cmd, descr, data=None, checks=None, region=None, log_error=True):
    """Run the provided command, logging details and checking for errors.
    """
    descr = _descr_str(descr, data, region)
    logger.debug(descr)
    # TODO: Extract entity information from data input
    cmd_id = diagnostics.start_cmd(descr, data, cmd)
    try:
        logger_cl.debug(" ".join(cmd) if not isinstance(cmd, basestring) else cmd)
        _do_run(cmd, checks)
    except:
        diagnostics.end_cmd(cmd_id, False)
        if log_error:
            logger.exception()
        raise
    finally:
        diagnostics.end_cmd(cmd_id)

def run(cmd, descr, data=None, checks=None):
    """Run the provided command, logging details and checking for errors.
    """
    if data:
        descr = "{0} : {1}".format(descr, data["name"][-1])
    logger.debug(descr)
    # TODO: Extract entity information from data input
    cmd_id = diagnostics.start_cmd(descr, data, cmd)
    try:
        logger_cl.debug(" ".join(cmd) if not isinstance(cmd, basestring) else cmd)
        _do_run(cmd, checks)
    except:
        diagnostics.end_cmd(cmd_id, False)
        logger.exception()
        raise
    finally:
        diagnostics.end_cmd(cmd_id)

def _setup_logging(args):
    if len(args) > 0:
        for check_i in [0, -1]:
            config = args[0][check_i]
            if isinstance(config, dict) and "config" in config:
                config = config["config"]
                break
            elif isinstance(config, dict) and "algorithm" in config:
                break
    else:
        config = None
    setup_logging(config)
    try:
        yield None
    except:
        logger.exception("Unexpected error")
        raise

def run(cmd, descr, data=None, checks=None, region=None, log_error=True, log_stdout=False):
    """Run the provided command, logging details and checking for errors.
    """
    descr = _descr_str(descr, data, region)
    logger.debug(descr)
    cmd_id = diagnostics.start_cmd(cmd, descr, data)
    try:
        logger_cl.debug(" ".join(str(x) for x in cmd) if not isinstance(cmd, basestring) else cmd)
        _do_run(cmd, checks, log_stdout)
    except:
        diagnostics.end_cmd(cmd_id, False)
        if log_error:
            logger.exception()
        raise
    finally:
        diagnostics.end_cmd(cmd_id)

def run(cmd, descr, data, checks=None):
    """Run the provided command, logging details and checking for errors.
    """
    if data:
        descr = "{0} : {1}".format(descr, data["name"][-1])
    logger.debug(descr)
    # TODO: Extract entity information from data input
    cmd_id = diagnostics.start_cmd(descr, data, cmd)
    try:
        logger_cl.debug(" ".join(cmd) if not isinstance(cmd, basestring) else cmd)
        _do_run(cmd, checks)
    except:
        diagnostics.end_cmd(cmd_id, False)
        logger.exception()
        raise
    finally:
        diagnostics.end_cmd(cmd_id)

def run(cmd, descr, data=None, checks=None, region=None, log_error=True, log_stdout=False):
    """Run the provided command, logging details and checking for errors.
    """
    descr = _descr_str(descr, data, region)
    logger.debug(descr)
    # TODO: Extract entity information from data input
    cmd_id = diagnostics.start_cmd(descr, data, cmd)
    try:
        logger_cl.debug(" ".join(str(x) for x in cmd) if not isinstance(cmd, basestring) else cmd)
        _do_run(cmd, checks, log_stdout)
    except:
        diagnostics.end_cmd(cmd_id, False)
        if log_error:
            logger.exception()
        raise
    finally:
        diagnostics.end_cmd(cmd_id)

def _run_smoove(full_bams, sr_bams, disc_bams, work_dir, items):
    """Run lumpy-sv using smoove.
    """
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    name = "%s%s" % (dd.get_sample_name(items[0]), ext)
    out_file = os.path.join(work_dir, "%s-smoove.genotyped.vcf.gz" % name)
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    old_out_file = os.path.join(work_dir, "%s%s-prep.vcf.gz"
                                % (os.path.splitext(os.path.basename(items[0]["align_bam"]))[0], ext))
    if utils.file_exists(old_out_file):
        return old_out_file, sv_exclude_bed
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            cores = dd.get_num_cores(items[0])
            out_dir = os.path.dirname(tx_out_file)
            ref_file = dd.get_ref_file(items[0])
            full_bams = " ".join(_prepare_smoove_bams(full_bams, sr_bams, disc_bams, items,
                                                      os.path.dirname(tx_out_file)))
            std_excludes = ["~^GL", "~^HLA", "~_random", "~^chrUn", "~alt", "~decoy"]
            def _is_std_exclude(n):
                clean_excludes = [x.replace("~", "").replace("^", "") for x in std_excludes]
                return any([n.startswith(x) or n.endswith(x) for x in clean_excludes])
            exclude_chrs = [c.name for c in ref.file_contigs(ref_file)
                            if not chromhacks.is_nonalt(c.name) and not _is_std_exclude(c.name)]
            exclude_chrs = "--excludechroms '%s'" % ",".join(std_excludes + exclude_chrs)
            exclude_bed = ("--exclude %s" % sv_exclude_bed) if utils.file_exists(sv_exclude_bed) else ""
            tempdir = os.path.dirname(tx_out_file)
            cmd = ("export TMPDIR={tempdir} && "
                   "smoove call --processes {cores} --genotype --removepr --fasta {ref_file} "
                   "--name {name} --outdir {out_dir} "
                   "{exclude_bed} {exclude_chrs} {full_bams}")
            with utils.chdir(tempdir):
                try:
                    do.run(cmd.format(**locals()), "smoove lumpy calling", items[0])
                except subprocess.CalledProcessError as msg:
                    if _allowed_errors(str(msg)):
                        vcfutils.write_empty_vcf(tx_out_file, config=items[0]["config"],
                                                 samples=[dd.get_sample_name(d) for d in items])
                    else:
                        logger.exception()
                        raise
    vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file, sv_exclude_bed

def _run_smoove(full_bams, sr_bams, disc_bams, work_dir, items):
    """Run lumpy-sv using smoove.
    """
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    name = "%s%s" % (dd.get_sample_name(items[0]), ext)
    out_file = os.path.join(work_dir, "%s-smoove.genotyped.vcf.gz" % name)
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    old_out_file = os.path.join(work_dir, "%s%s-prep.vcf.gz"
                                % (os.path.splitext(os.path.basename(items[0]["align_bam"]))[0], ext))
    if utils.file_exists(old_out_file):
        return old_out_file, sv_exclude_bed
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            cores = dd.get_num_cores(items[0])
            out_dir = os.path.dirname(tx_out_file)
            ref_file = dd.get_ref_file(items[0])
            full_bams = " ".join(_prepare_smoove_bams(full_bams, sr_bams, disc_bams, items,
                                                      os.path.dirname(tx_out_file)))
            std_excludes = ["~^GL", "~^HLA", "~_random", "~^chrUn", "~alt", "~decoy"]
            def _is_std_exclude(n):
                clean_excludes = [x.replace("~", "").replace("^", "") for x in std_excludes]
                return any([n.startswith(x) or n.endswith(x) for x in clean_excludes])
            exclude_chrs = [c.name for c in ref.file_contigs(ref_file)
                            if not chromhacks.is_nonalt(c.name) and not _is_std_exclude(c.name)]
            exclude_chrs = "--excludechroms '%s'" % ",".join(std_excludes + exclude_chrs)
            exclude_bed = ("--exclude %s" % sv_exclude_bed) if utils.file_exists(sv_exclude_bed) else ""
            tempdir = os.path.dirname(tx_out_file)
            cmd = ("export TMPDIR={tempdir} && "
                   "smoove call --processes {cores} --genotype --removepr --fasta {ref_file} "
                   "--name {name} --outdir {out_dir} "
                   "{exclude_bed} {exclude_chrs} {full_bams}")
            with utils.chdir(tempdir):
                try:
                    do.run(cmd.format(**locals()), "smoove lumpy calling", items[0])
                except subprocess.CalledProcessError as msg:
                    # Pass the stringified error so the regex matching in _allowed_errors works
                    if _allowed_errors(str(msg)):
                        vcfutils.write_empty_vcf(tx_out_file, config=items[0]["config"],
                                                 samples=[dd.get_sample_name(d) for d in items])
                    else:
                        logger.exception()
                        raise
    vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file, sv_exclude_bed

def process(args):
    """Run the function in args.name given arguments in args.argfile.
    """
    # Set environment to standard to use periods for decimals and avoid localization
    os.environ["LC_ALL"] = "C"
    os.environ["LC"] = "C"
    os.environ["LANG"] = "C"
    try:
        fn = getattr(multitasks, args.name)
    except AttributeError:
        raise AttributeError("Did not find exposed function in bcbio.distributed.multitasks named '%s'"
                             % args.name)
    if args.moreargs or args.raw:
        fnargs = [args.argfile] + args.moreargs
        work_dir = None
        argfile = None
    else:
        with open(args.argfile) as in_handle:
            fnargs = yaml.safe_load(in_handle)
        work_dir = os.path.dirname(args.argfile)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = args.outfile if args.outfile else "%s-out%s" % os.path.splitext(args.argfile)
    if not work_dir:
        work_dir = os.getcwd()
    if len(fnargs) > 0 and fnargs[0] == "cwl":
        fnargs, parallel, out_keys = _world_from_cwl(args.name, fnargs[1:], work_dir)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = os.path.join(work_dir, "cwl.output.json")
    else:
        parallel, out_keys = None, {}
    with utils.chdir(work_dir):
        with contextlib.closing(log.setup_local_logging(parallel={"wrapper": "runfn"})):
            try:
                out = fn(fnargs)
            except:
                logger.exception()
                raise
            if argfile:
                try:
                    _write_out_argfile(argfile, out, fnargs, parallel, out_keys, work_dir)
                except:
                    logger.exception()
                    raise
                if argfile.endswith(".json"):
                    _write_wdl_outputs(argfile, out_keys)

def _bgzip_from_bam(bam_file, dirs, config, is_retry=False):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = int(resources.get("memory", "1073741824")) * cores  # 1Gb/core default
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(work_dir, "%s-1.fq.gz" % os.path.splitext(os.path.basename(bam_file))[0])
    if bam.is_paired(bam_file):
        out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    else:
        out_file_2 = None
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        with file_transaction(config, out_file_1) as tx_out_file:
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = objectstore.cl_input(bam_file)
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} " + out_str
            try:
                do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info("bamtofastq deflate IO failure preparing %s. Retrying with single core."
                                % (bam_file))
                    needs_retry = True
                else:
                    logger.exception()
                    raise
    if needs_retry:
        return _bgzip_from_bam(bam_file, dirs, config, is_retry=True)
    else:
        return [x for x in [out_file_1, out_file_2] if x is not None]

def _setup_logging(args):
    config = None
    if len(args) == 1 and isinstance(args[0], (list, tuple)):
        args = args[0]
    for arg in args:
        if ipython.is_nested_config_arg(arg):
            config = arg["config"]
            break
        elif ipython.is_std_config_arg(arg):
            config = arg
            break
    if config is not None:
        setup_logging(config)
    else:
        raise NotImplementedError("No config in: %s" % args[0])
    try:
        yield None
    except:
        logger.exception("Unexpected error")
        raise

def process(args):
    """Run the function in args.name given arguments in args.argfile.
    """
    try:
        fn = getattr(multitasks, args.name)
    except AttributeError:
        raise AttributeError("Did not find exposed function in bcbio.distributed.multitasks named '%s'"
                             % args.name)
    if args.moreargs or args.raw:
        fnargs = [args.argfile] + args.moreargs
        work_dir = None
        argfile = None
    else:
        with open(args.argfile) as in_handle:
            fnargs = yaml.safe_load(in_handle)
        work_dir = os.path.dirname(args.argfile)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = args.outfile if args.outfile else "%s-out%s" % os.path.splitext(args.argfile)
    if not work_dir:
        work_dir = os.getcwd()
    if len(fnargs) > 0 and fnargs[0] == "cwl":
        fnargs, parallel = _world_from_cwl(fnargs[1:], work_dir)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = os.path.join(work_dir, "cwl.output.json")
    else:
        parallel = None
    with utils.chdir(work_dir):
        log.setup_local_logging(parallel={"wrapper": "runfn"})
        try:
            out = fn(fnargs)
        except:
            logger.exception()
            raise
        if argfile:
            try:
                _write_out_argfile(argfile, out, fnargs, parallel, work_dir)
            except:
                logger.exception()
                raise

def _run_bubbletree(vcf_csv, cnv_csv, data):
    """Create R script and run on input data
    """
    local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                 "lib", "R", "site-library")
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = "%s-run.R" % base
    bubbles_out = "%s-bubbles.pdf" % base
    prev_model_out = "%s-bubbletree_prev_model.pdf" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    if not utils.file_exists(freqs_out):
        try:
            do.run(["Rscript", r_file], "Assess heterogeneity with BubbleTree")
        except subprocess.CalledProcessError as msg:
            if _allowed_bubbletree_errorstates(str(msg)):
                with open(freqs_out, "w") as out_handle:
                    out_handle.write('bubbletree failed:\n %s\n' % (str(msg)))
            else:
                logger.exception()
                raise

def stop(view):
    try:
        ipython_cluster.stop_from_view(view)
        time.sleep(10)
    except:
        logger.exception("Did not stop IPython cluster correctly")