def detect_fusions(data):
    data = to_single_data(data)
    # support the old style of fusion mode calling
    if dd.get_fusion_mode(data, False):
        data = dd.set_fusion_caller(data, ["oncofuse", "pizzly"])
        logger.warning("``fusion_mode`` is deprecated in favor of turning on "
                       "callers with ``fusion_caller``. It will run pizzly and "
                       "oncofuse for now, but will eventually have support "
                       "dropped.")
    fusion_caller = dd.get_fusion_caller(data, [])
    if "oncofuse" in fusion_caller:
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            data = dd.set_oncofuse_file(data, oncofuse_file)
    if "pizzly" in fusion_caller:
        pizzly_dir = pizzly.run_pizzly(data)
        if pizzly_dir:
            data = dd.set_pizzly_dir(data, pizzly_dir)
            data["fusion"] = {"fasta": os.path.join(pizzly_dir, "%s.fusions.fasta" % dd.get_sample_name(data)),
                              "json": os.path.join(pizzly_dir, "%s.json" % dd.get_sample_name(data))}
    if "ericscript" in fusion_caller:
        ericscript_dir = ericscript.run(data)
    return [[data]]
def _find_mirge(data):
    try:
        mirge = config_utils.get_program("miRge2.0", data)
        return mirge
    except config_utils.CmdNotFound:
        logger.warning("miRge2.0 is not found. Install it first, and try again.")
    return None
def _mirtop(input_fn, sps, db, out_dir, config):
    """Convert to GFF3 standard format"""
    hairpin = os.path.join(db, "hairpin.fa")
    gtf = os.path.join(db, "mirbase.gff3")
    if not file_exists(hairpin) or not file_exists(gtf):
        logger.warning("%s or %s are not installed. Skipping." % (hairpin, gtf))
        return None
    out_gtf_fn = "%s.gtf" % utils.splitext_plus(os.path.basename(input_fn))[0]
    out_gff_fn = "%s.gff" % utils.splitext_plus(os.path.basename(input_fn))[0]
    export = _get_env()
    cmd = ("{export} mirtop gff --sps {sps} --hairpin {hairpin} "
           "--gtf {gtf} --format seqbuster -o {out_tx} {input_fn}")
    if not file_exists(os.path.join(out_dir, out_gtf_fn)) and \
       not file_exists(os.path.join(out_dir, out_gff_fn)):
        with tx_tmpdir() as out_tx:
            do.run(cmd.format(**locals()), "Do miRNA annotation for %s" % input_fn)
            with utils.chdir(out_tx):
                out_fn = out_gtf_fn if utils.file_exists(out_gtf_fn) else out_gff_fn
                if utils.file_exists(out_fn):
                    shutil.move(os.path.join(out_tx, out_fn),
                                os.path.join(out_dir, out_fn))
    # use the basename in both branches so the later join gives a single clean path
    out_fn = out_gtf_fn if utils.file_exists(os.path.join(out_dir, out_gtf_fn)) else out_gff_fn
    if utils.file_exists(os.path.join(out_dir, out_fn)):
        return os.path.join(out_dir, out_fn)
def clean_inputs(data):
    """Clean BED input files to avoid overlapping segments that cause downstream issues.

    Pre-merges inputs to avoid needing to call multiple times during later parallel steps.
    """
    if not utils.get_in(data, ("config", "algorithm", "variant_regions_orig")):
        data["config"]["algorithm"]["variant_regions_orig"] = dd.get_variant_regions(data)
    clean_vr = clean_file(dd.get_variant_regions(data), data)
    merged_vr = merge_overlaps(clean_vr, data)
    data["config"]["algorithm"]["variant_regions"] = clean_vr
    data["config"]["algorithm"]["variant_regions_merged"] = merged_vr
    if dd.get_coverage(data):
        if not utils.get_in(data, ("config", "algorithm", "coverage_orig")):
            data["config"]["algorithm"]["coverage_orig"] = dd.get_coverage(data)
        clean_cov_bed = clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True)
        merged_cov_bed = merge_overlaps(clean_cov_bed, data)
        data["config"]["algorithm"]["coverage"] = clean_cov_bed
        data["config"]["algorithm"]["coverage_merged"] = merged_cov_bed
    if 'seq2c' in get_svcallers(data):
        seq2c_ready_bed = prep_seq2c_bed(data)
        if not seq2c_ready_bed:
            logger.warning("Can't run Seq2C without a svregions or variant_regions BED file")
        else:
            data["config"]["algorithm"]["seq2c_bed_ready"] = seq2c_ready_bed
    return data
def summary(samples, config):
    """Provide summary information on a single sample across regions of interest.
    """
    try:
        bc_jar = config_utils.get_jar("bcbio.coverage",
                                      config_utils.get_program("bcbio_coverage", config, "dir"))
    except ValueError:
        logger.warning("No coverage calculations: Did not find bcbio.coverage jar from system config")
        return [[x] for x in samples]
    config_file, out_file = _prep_coverage_config(samples, config)
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "tmp"))
    resources = config_utils.get_resources("bcbio_coverage", config)
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    java_args = ["-Djava.io.tmpdir=%s" % tmp_dir]
    cmd = ["java"] + jvm_opts + java_args + ["-jar", bc_jar, "multicompare",
                                             config_file, out_file,
                                             "-c", str(config["algorithm"]["num_cores"])]
    do.run(cmd, "Summarizing coverage with bcbio.coverage", samples[0])
    out = []
    for x in samples:
        x["coverage"] = {"summary": out_file}
        out.append([x])
    return out
def _get_samples_to_process(fn, out_dir, config, force_single):
    """Parse a CSV file with one line per file.
    It will merge all files that have the same description name."""
    out_dir = os.path.abspath(out_dir)
    samples = defaultdict(list)
    with open(fn) as handle:
        for l in handle:
            cols = l.strip().split(",")
            if len(cols) > 0:
                if len(cols) < 2:
                    raise ValueError("Line needs 2 values: file and name.")
                if utils.file_exists(cols[0]) or is_gsm(cols[0]):
                    if cols[0].find(" ") > -1:
                        new_name = os.path.abspath(cols[0].replace(" ", "_"))
                        logger.warning("Spaces found in %s. Linked to %s." % (cols[0], new_name))
                        logger.warning("Please avoid names with spaces in the future.")
                        utils.symlink_plus(os.path.abspath(cols[0]), new_name)
                        cols[0] = new_name
                    samples[cols[1]].append(cols)
                else:
                    logger.info("Skipping %s, file doesn't exist." % cols[0])
    for sample, items in samples.items():
        if is_fastq(items[0][0], True):
            fn = "fq_merge"
            ext = ".fastq.gz"
        elif is_bam(items[0][0]):
            fn = "bam_merge"
            ext = ".bam"
        elif is_gsm(items[0][0]):
            fn = "query_gsm"
            ext = ".fastq.gz"
        files = [os.path.abspath(fn_file[0]) if not is_gsm(fn_file[0]) else fn_file[0]
                 for fn_file in items]
        samples[sample] = [{'files': _check_paired(files, force_single),
                            'out_file': os.path.join(out_dir, sample + ext),
                            'fn': fn,
                            'anno': items[0][2:],
                            'config': config,
                            'name': sample,
                            'out_dir': out_dir}]
    return [samples[sample] for sample in samples]
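# A minimal usage sketch (not from the source) of the CSV layout
# _get_samples_to_process() expects: one file (or GSM accession) per line, the
# sample name in the second column, optional annotation columns after that;
# rows sharing a name are merged. The paths and the empty config dict below
# are hypothetical.
#
#   samples.csv:
#     /data/run1/brain_rep1_L001.fastq.gz,brain_rep1,tissue=brain
#     /data/run1/brain_rep1_L002.fastq.gz,brain_rep1,tissue=brain
#     /data/run1/liver_rep1.bam,liver_rep1,tissue=liver
#
# to_merge = _get_samples_to_process("samples.csv", "merged", config={}, force_single=False)
# -> one work unit per sample name, e.g. both brain_rep1 lanes merged into
#    merged/brain_rep1.fastq.gz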
def summary(samples, config):
    """Provide summary information on a single sample across regions of interest.
    """
    try:
        bc_jar = config_utils.get_jar("bcbio.coverage",
                                      config_utils.get_program("bcbio_coverage", config, "dir"))
    except ValueError:
        logger.warning("No coverage calculations: Did not find bcbio.coverage jar from system config")
        return [[x] for x in samples]
    config_file, out_file = _prep_coverage_config(samples, config)
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "tmp"))
    resources = config_utils.get_resources("bcbio_coverage", config)
    config = copy.deepcopy(config)
    config["algorithm"]["memory_adjust"] = {"direction": "increase",
                                            "magnitude": config["algorithm"].get("num_cores", 1)}
    jvm_opts = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"]), config)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            java_args = ["-Djava.io.tmpdir=%s" % tmp_dir, "-Djava.awt.headless=true"]
            cmd = ["java"] + jvm_opts + java_args + ["-jar", bc_jar, "multicompare",
                                                     config_file, tx_out_file,
                                                     "-c", str(config["algorithm"].get("num_cores", 1))]
            do.run(cmd, "Summarizing coverage with bcbio.coverage", samples[0])
    out = []
    for x in samples:
        x["coverage"] = {"summary": out_file}
        out.append([x])
    return out
def detect_fusions(samples):
    """Run fusion with a standalone tool, specified in config as fusion_caller.

    If fusion_mode is True, and no fusion_caller is specified, or
    fusion_caller == 'aligner', it is assumed that gene fusion detection was
    run on the alignment step.
    """
    fusion_mode = dd.get_in_samples(samples, dd.get_fusion_mode)
    if not fusion_mode:
        return samples

    caller = dd.get_in_samples(samples, dd.get_fusion_caller)
    if not caller or caller == 'aligner':
        logger.info("No standalone fusion caller specified in the config.")
        return samples

    STANDALONE_CALLERS = {
        'ericscript': ericscript.run,
    }
    caller_fn = STANDALONE_CALLERS.get(caller)
    if not caller_fn:
        logger.warning("Gene fusion detection with %s is not supported. "
                       "Supported callers:\n%s"
                       % (caller, ', '.join(STANDALONE_CALLERS.keys())))
        return samples

    logger.info("Running gene fusion detection with %s" % caller)
    return [[caller_fn(s)] for s in dd.sample_data_iterator(samples)]
def run(bam_file, data, out_dir):
    out_base = os.path.join(utils.safe_makedir(out_dir),
                            "%s-verifybamid" % (dd.get_sample_name(data)))
    out_file = out_base + ".selfSM"
    failed_file = out_base + ".failed"
    exts = [".out"]
    out = {}
    if not utils.file_exists(out_file) and not utils.file_exists(failed_file):
        with file_transaction(data, out_base) as tx_out_base:
            cmd = ["verifybamid2", "1000g.phase3", "100k",
                   "b38" if dd.get_genome_build(data) == "hg38" else "b37",
                   "--Reference", dd.get_ref_file(data), "--Output", tx_out_base,
                   "--DisableSanityCheck"]
            cmd += _get_input_args(bam_file, data, out_base)
            try:
                do.run(cmd, "VerifyBamID contamination checks")
            except subprocess.CalledProcessError as msg:
                def allowed_errors(l):
                    return (l.find("Insufficient Available markers") >= 0 or
                            l.find("No reads found in any of the regions") >= 0)
                if any([allowed_errors(l) for l in str(msg).split("\n")]):
                    logger.info("Skipping VerifyBamID, not enough overlapping markers found: %s" %
                                dd.get_sample_name(data))
                    with open(failed_file, "w") as out_handle:
                        out_handle.write(str(msg))
                else:
                    logger.warning(str(msg))
                    raise
            else:
                # Fix any sample name problems, for pileups
                shutil.move(tx_out_base + ".selfSM", tx_out_base + ".selfSM.orig")
                with open(tx_out_base + ".selfSM.orig") as in_handle:
                    with open(tx_out_base + ".selfSM", "w") as out_handle:
                        sample_name = None
                        for line in in_handle:
                            if line.startswith("DefaultSampleName"):
                                line = line.replace("DefaultSampleName", dd.get_sample_name(data))
                            # work around bug in finding SM from BAM RG at end of line
                            if len(line.strip().split("\t")) == 1:
                                sample_name = line.strip()
                                line = None
                            elif sample_name:
                                parts = line.split("\t")
                                parts[0] = sample_name
                                line = "\t".join(parts)
                                sample_name = None
                            if line:
                                out_handle.write(line)
                for e in exts + [".selfSM"]:
                    if os.path.exists(tx_out_base + e):
                        shutil.copy(tx_out_base + e, out_base + e)
def clean_chipseq_alignment(data):
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    method = dd.get_chip_method(data)
    if method == "atac":
        data = clean_ATAC(data)
    # for ATAC-seq, this will be the NF BAM
    work_bam = dd.get_work_bam(data)
    work_bam = bam.sort(work_bam, dd.get_config(data))
    bam.index(work_bam, dd.get_config(data))
    clean_bam = remove_nonassembled_chrom(work_bam, data)
    clean_bam = remove_mitochondrial_reads(clean_bam, data)
    data = atac.calculate_complexity_metrics(clean_bam, data)
    if not dd.get_keep_multimapped(data):
        clean_bam = remove_multimappers(clean_bam, data)
    if not dd.get_keep_duplicates(data):
        clean_bam = bam.remove_duplicates(clean_bam, data)
    data["work_bam"] = clean_bam
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = remove_blacklist_regions(dd.get_work_bam(data), encode_bed, data['config'])
        bam.index(data["work_bam"], data['config'])
    try:
        data["bigwig"] = _normalized_bam_coverage(dd.get_sample_name(data),
                                                  dd.get_work_bam(data), data)
    except subprocess.CalledProcessError:
        logger.warning(f"{dd.get_work_bam(data)} was too sparse to normalize, "
                       f"falling back to non-normalized coverage.")
        data["bigwig"] = _bam_coverage(dd.get_sample_name(data),
                                       dd.get_work_bam(data), data)
    return [[data]]
def _generate_estimates(bam_file, out_base, failed_file, exts, data):
    background = {"dataset": "1000g.phase3",
                  "nvars": "100k",
                  "build": "b38" if dd.get_genome_build(data) == "hg38" else "b37"}
    with file_transaction(data, out_base) as tx_out_base:
        cmd = ["verifybamid2", background["dataset"], background["nvars"], background["build"],
               "--Reference", dd.get_ref_file(data), "--Output", tx_out_base]
        cmd += _get_input_args(bam_file, data, out_base, background)
        try:
            do.run(cmd, "VerifyBamID contamination checks")
        except subprocess.CalledProcessError as msg:
            def allowed_errors(l):
                return (l.find("Insufficient Available markers") >= 0 or
                        l.find("No reads found in any of the regions") >= 0)
            if any([allowed_errors(l) for l in str(msg).split("\n")]):
                logger.info("Skipping VerifyBamID, not enough overlapping markers found: %s" %
                            dd.get_sample_name(data))
                with open(failed_file, "w") as out_handle:
                    out_handle.write(str(msg))
            else:
                logger.warning(str(msg))
                raise
        else:
def run(data):
    config = data[0][0]['config']
    work_dir = dd.get_work_dir(data[0][0])
    genome = dd.get_ref_file(data[0][0])
    mirdeep2 = os.path.join(os.path.dirname(sys.executable), "miRDeep2.pl")
    perl_exports = get_perl_exports()
    hairpin, mature, species = "none", "none", "na"
    rfam_file = dd.get_mirdeep2_file(data[0][0])
    if file_exists(dd.get_mirbase_hairpin(data[0][0])):
        species = dd.get_species(data[0][0])
        hairpin = dd.get_mirbase_hairpin(data[0][0])
        mature = dd.get_mirbase_mature(data[0][0])

    logger.debug("Preparing for mirdeep2 analysis.")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    seqs_dir = op.join(work_dir, "seqcluster", "prepare")
    collapsed = op.join(seqs_dir, "seqs.ma")
    out_dir = op.join(work_dir, "mirdeep2")
    out_file = op.join(out_dir, "result_res.csv")
    safe_makedir(out_dir)
    with chdir(out_dir):
        collapsed, bam_file = _prepare_inputs(collapsed, bam_file, out_dir)
        cmd = ("{perl_exports} && perl {mirdeep2} {collapsed} {genome} {bam_file} "
               "{mature} none {hairpin} -f {rfam_file} -r simple -c -P -t {species} "
               "-z res").format(**locals())
        if file_exists(mirdeep2) and not file_exists(out_file) and file_exists(rfam_file):
            try:
                do.run(cmd.format(**locals()), "Running mirdeep2.")
            except:
                logger.warning("mirdeep2 failed. Please report the error to "
                               "https://github.com/lpantano/mirdeep2_core/issues.")
        if file_exists(out_file):
            novel_db = _parse_novel(out_file, dd.get_species(data[0][0]))
            return novel_db
def _find_lib(data):
    """Find mirge libs"""
    options = " ".join(data.get('resources', {}).get('mirge', {}).get("options", ""))
    if options.find("-lib") > -1 and utils.file_exists(options.split()[1]):
        return options
    if not options:
        logger.warning("miRge libraries not found. Follow these instructions to install them:")
    return None
def _generate_estimates(bam_file, out_base, failed_file, exts, data):
    background = {"dataset": "1000g.phase3",
                  "nvars": "100k",
                  "build": "b38" if dd.get_genome_build(data) == "hg38" else "b37"}
    with file_transaction(data, out_base) as tx_out_base:
        num_cores = dd.get_num_cores(data)
        cmd = ["verifybamid2", background["dataset"], background["nvars"], background["build"],
               "--Reference", dd.get_ref_file(data), "--Output", tx_out_base,
               "--NumThread", str(num_cores)]
        cmd += _get_input_args(bam_file, data, out_base, background)
        try:
            do.run(cmd, "VerifyBamID contamination checks")
        except subprocess.CalledProcessError as msg:
            def allowed_errors(l):
                return (l.find("Insufficient Available markers") >= 0 or
                        l.find("No reads found in any of the regions") >= 0)
            if any([allowed_errors(l) for l in str(msg).split("\n")]):
                logger.info("Skipping VerifyBamID, not enough overlapping markers found: %s" %
                            dd.get_sample_name(data))
                with open(failed_file, "w") as out_handle:
                    out_handle.write(str(msg))
            else:
                logger.warning(str(msg))
                # don't escalate, it breaks some terminals on AWS Ubuntu
                # raise
        else:
            # Fix any sample name problems, for pileups
            shutil.move(tx_out_base + ".selfSM", tx_out_base + ".selfSM.orig")
            with open(tx_out_base + ".selfSM.orig") as in_handle:
                with open(tx_out_base + ".selfSM", "w") as out_handle:
                    sample_name = None
                    for line in in_handle:
                        if line.startswith("DefaultSampleName"):
                            line = line.replace("DefaultSampleName", dd.get_sample_name(data))
                        # work around bug in finding SM from BAM RG at end of line
                        if len(line.strip().split("\t")) == 1:
                            sample_name = line.strip()
                            line = None
                        elif sample_name:
                            parts = line.split("\t")
                            parts[0] = sample_name
                            line = "\t".join(parts)
                            sample_name = None
                        if line:
                            out_handle.write(line)
            for e in exts + [".selfSM"]:
                if os.path.exists(tx_out_base + e):
                    shutil.copy(tx_out_base + e, out_base + e)
def run_peddy(samples, out_dir=None):
    vcf_file = None
    for d in samples:
        vcinfo = variant.get_active_vcinfo(d)
        if vcinfo and vcinfo.get("vrn_file") and utils.file_exists(vcinfo["vrn_file"]):
            if vcinfo["vrn_file"] and dd.get_sample_name(d) in vcfutils.get_samples(vcinfo["vrn_file"]):
                vcf_file = vcinfo["vrn_file"]
                break
    data = samples[0]
    peddy = config_utils.get_program("peddy", data) if config_utils.program_installed("peddy", data) else None
    if not peddy or not vcf_file or not is_human(data):
        logger.info("peddy is not installed, not human or sample VCFs don't match, "
                    "skipping correspondence checking for %s." % vcf_file)
        return samples
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"
    peddyfiles = expected_peddy_files(peddy_report, batch)
    if file_exists(peddy_report):
        return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    num_cores = dd.get_num_cores(data)
    with tx_tmpdir(data) as tx_dir:
        peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix))
        # Redirects stderr because incredibly noisy with no intervals found messages from cyvcf2
        stderr_log = os.path.join(tx_dir, "run-stderr.log")
        cmd = "{peddy} -p {num_cores} --plot --prefix {peddy_prefix_tx} {vcf_file} {ped_file} 2> {stderr_log}"
        message = "Running peddy on {vcf_file} against {ped_file}."
        try:
            do.run(cmd.format(**locals()), message.format(**locals()))
        except:
            to_show = collections.deque(maxlen=100)
            with open(stderr_log) as in_handle:
                for line in in_handle:
                    to_show.append(line)
            if any([l.find("IndexError") >= 0 and l.find("is out of bounds for axis") >= 0
                    for l in to_show]):
                logger.info("Skipping peddy because no variants overlap with checks: %s" % batch)
                with open(peddy_prefix + "-failed.log", "w") as out_handle:
                    out_handle.write("peddy did not find overlaps with 1kg sites in VCF, skipping")
                return samples
            else:
                logger.warning("".join(to_show))
                raise
        for ext in PEDDY_OUT_EXTENSIONS:
            if os.path.exists(peddy_prefix_tx + ext):
                shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
def _download_srx(url, out_dir):
    cmd = "wget -N -r -nH -nd -np -nv {0}".format(url)
    out_dir = os.path.abspath(utils.safe_makedir(out_dir))
    with utils.chdir(out_dir):
        try:
            do.run(cmd, "Download %s" % url)
        except:
            logger.warning("Sample path not found in database. Skipping.")
            traceback.print_exc()
            return None
    return [os.path.join(out_dir, fn) for fn in os.listdir(out_dir)]
def _download_srx(srxid, url, out_dir):
    cmd = "wget -N -r -nH -nd -np -nv {0}".format(url)
    out_dir = os.path.abspath(utils.safe_makedir(out_dir))
    with utils.chdir(out_dir):
        try:
            do.run(cmd, "Download %s" % url)
        except:
            logger.warning("Sample path not found in database. Skipping.")
            traceback.print_exc()
            return None
    return [os.path.join(out_dir, fn) for fn in os.listdir(out_dir)]
def summary(*samples):
    """Summarize all quality metrics together"""
    samples = utils.unpack_worlds(samples)
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    folders = []
    opts = ""
    out_dir = os.path.join(work_dir, "multiqc")
    out_data = os.path.join(work_dir, "multiqc", "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    samples = _report_summary(samples, os.path.join(out_dir, "report"))
    for data in samples:
        for program, pfiles in tz.get_in(["summary", "qc"], data, {}).items():
            if isinstance(pfiles, dict):
                pfiles = pfiles["base"]
            folders.append(os.path.dirname(pfiles))
    # XXX temporary workaround until we can handle larger inputs through MultiQC
    folders = list(set(folders))
    if len(folders) > 250:
        logger.warning("Too many samples for MultiQC, only using first 250 entries.")
        folders = folders[:250]
        opts = "--flat"
    # Back compatible -- to migrate to explicit specifications in input YAML
    folders += ["trimmed", "htseq-count/*summary"]
    if not utils.file_exists(out_file):
        with utils.chdir(work_dir):
            input_dir = " ".join([_check_multiqc_input(d) for d in folders])
            export_tmp = ""
            if dd.get_tmp_dir(samples[0]):
                export_tmp = "export TMPDIR=%s &&" % dd.get_tmp_dir(samples[0])
            if input_dir.strip():
                cmd = "{export_tmp} {multiqc} -f {input_dir} -o {tx_out} {opts}"
                with tx_tmpdir(data, work_dir) as tx_out:
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(os.path.join(tx_out, "multiqc_report.html"), out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    out = []
    for i, data in enumerate(samples):
        if i == 0:
            if utils.file_exists(out_file):
                data_files = glob.glob(os.path.join(out_dir, "multiqc_data", "*.txt"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.bed"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.txt"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.tsv"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*.R*"))
                if "summary" not in data:
                    data["summary"] = {}
                data["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}
        out.append(data)
    return [[d] for d in out]
def chipqc(bam_file, sample, out_dir):
    """Attempt to run the ChIPQC Bioconductor package on one sample"""
    work_dir = dd.get_work_dir(sample)
    sample_name = dd.get_sample_name(sample)
    logger.warning("ChIPQC is unstable right now, if it breaks, turn off the tool.")
    if utils.file_exists(out_dir):
        return _get_output(out_dir)
    with tx_tmpdir() as tmp_dir:
        rcode = _sample_template(sample, tmp_dir)
        # local_sitelib = utils.R_sitelib()
        rscript = utils.Rscript_cmd()
        do.run([rscript, rcode], "ChIPQC in %s" % sample_name, log_error=False)
        shutil.move(tmp_dir, out_dir)
    return _get_output(out_dir)
def chipqc(bam_file, sample, out_dir):
    """Attempt to run the ChIPQC Bioconductor package on one sample"""
    sample_name = dd.get_sample_name(sample)
    logger.warning("ChIPQC is unstable right now, if it breaks, turn off the tool.")
    if utils.file_exists(out_dir):
        return _get_output(out_dir)
    with tx_tmpdir() as tmp_dir:
        rcode = _sample_template(sample, tmp_dir)
        if rcode:
            # local_sitelib = utils.R_sitelib()
            rscript = utils.Rscript_cmd()
            do.run([rscript, "--no-environ", rcode], "ChIPQC in %s" % sample_name, log_error=False)
            shutil.move(tmp_dir, out_dir)
    return _get_output(out_dir)
def _get_env():
    conda = os.path.join(os.path.dirname(sys.executable), "conda")
    anaconda = os.path.join(os.path.dirname(sys.executable), "..")
    cl = ("{conda} list --json -f seqbuster").format(**locals())
    with closing(subprocess.Popen(cl, stdout=subprocess.PIPE,
                                  stderr=subprocess.STDOUT, shell=True).stdout) as stdout:
        try:
            version = stdout.readlines()[2].strip().split()[1]
            if LooseVersion(version) >= LooseVersion("3"):
                logger.info("miraligner version %s" % version)
                return "JAVA_HOME=%s && " % anaconda
        except:
            logger.warning("Cannot detect miraligner version, assuming latest.")
    return ""
def quantitate(data):
    """CWL target for quantitation.

    XXX Needs to be split and parallelized by expression caller, with merging
    of multiple calls.
    """
    data = to_single_data(to_single_data(data))
    data = generate_transcript_counts(data)[0][0]
    data["quant"] = {}
    if "sailfish" in dd.get_expression_caller(data):
        data = to_single_data(sailfish.run_sailfish(data)[0])
        data["quant"]["tsv"] = data["sailfish"]
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(data["sailfish"]), "abundance.h5")
    if ("kallisto" in dd.get_expression_caller(data) or
            "pizzly" in dd.get_fusion_caller(data, [])):
        data = to_single_data(kallisto.run_kallisto_rnaseq(data)[0])
        data["quant"]["tsv"] = os.path.join(data["kallisto_quant"], "abundance.tsv")
        data["quant"]["hdf5"] = os.path.join(data["kallisto_quant"], "abundance.h5")
        if os.path.exists(os.path.join(data["kallisto_quant"], "fusion.txt")):
            data["quant"]["fusion"] = os.path.join(data["kallisto_quant"], "fusion.txt")
        else:
            data["quant"]["fusion"] = None
    if "salmon" in dd.get_expression_caller(data):
        if dd.get_quantify_genome_alignments(data):
            if dd.get_aligner(data).lower() != "star":
                if dd.get_genome_build(data) == "hg38":
                    logger.warning("Whole genome alignment-based Salmon quantification is "
                                   "only supported for the STAR aligner. Since this is hg38 we will fall "
                                   "back to the decoy method")
                    data = to_single_data(salmon.run_salmon_decoy(data)[0])
                else:
                    logger.warning("Whole genome alignment-based Salmon quantification is "
                                   "only supported for the STAR aligner. Falling back to the "
                                   "transcriptome-only method.")
                    data = to_single_data(salmon.run_salmon_reads(data)[0])
            else:
                data = to_single_data(salmon.run_salmon_bam(data)[0])
        else:
            data = to_single_data(salmon.run_salmon_reads(data)[0])
        data["quant"]["tsv"] = data["salmon"]
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(data["salmon"]), "abundance.h5")
    return [[data]]
def _check_stems(files):
    """Check if any stem names are repeated; if so, use the full path."""
    used = set()
    for fn in files:
        if os.path.basename(fn) in used:
            logger.warning("%s appears multiple times in your file list, so we "
                           "don't know how to assign it to the sample data in "
                           "the CSV. We will use the full path to distinguish "
                           "files, which means paired files should be in the "
                           "same folder. If this is a problem, rename the "
                           "files you want to merge." % os.path.basename(fn))
            return True
        used.add(os.path.basename(fn))
    return False
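# A quick illustration (hypothetical paths) of what _check_stems() flags:
# duplicate basenames in different folders return True, so callers fall back
# to full paths; unique basenames return False.
#
# _check_stems(["/run1/sample_R1.fastq.gz", "/run2/sample_R1.fastq.gz"])  # True
# _check_stems(["/run1/a_R1.fastq.gz", "/run1/a_R2.fastq.gz"])            # False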
def detect_fusions(data):
    # support the old style of fusion mode calling
    if dd.get_fusion_mode(data, False):
        data = dd.set_fusion_caller(data, ["oncofuse", "pizzly"])
        logger.warning("``fusion_mode`` is deprecated in favor of turning on "
                       "callers with ``fusion_caller``. It will run pizzly and "
                       "oncofuse for now, but will eventually have support "
                       "dropped.")
    if "oncofuse" in dd.get_fusion_caller(data, []):
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            data = dd.set_oncofuse_file(data, oncofuse_file)
    if "pizzly" in dd.get_fusion_caller(data, []):
        pizzly_dir = pizzly.run_pizzly(data)
        if pizzly_dir:
            data = dd.set_pizzly_dir(data, pizzly_dir)
    return [[data]]
def call_consensus(samples):
    """Call consensus peaks on the narrowPeak files from a set of ChIP/ATAC samples."""
    data = samples[0][0]
    new_samples = []
    consensusdir = os.path.join(dd.get_work_dir(data), "consensus")
    utils.safe_makedir(consensusdir)
    peakfiles = []
    for data in dd.sample_data_iterator(samples):
        if dd.get_chip_method(data) == "chip":
            for fn in tz.get_in(("peaks_files", "macs2"), data, []):
                if "narrowPeak" in fn:
                    peakfiles.append(fn)
                elif "broadPeak" in fn:
                    peakfiles.append(fn)
        elif dd.get_chip_method(data) == "atac":
            if bam.is_paired(dd.get_work_bam(data)):
                for fn in tz.get_in(("peaks_files", "NF", "macs2"), data, []):
                    if "narrowPeak" in fn:
                        peakfiles.append(fn)
            else:
                logger.info(f"Using peaks from full fraction since "
                            f"{dd.get_work_bam(data)} is single-ended.")
                for fn in tz.get_in(("peaks_files", "full", "macs2"), data, []):
                    if "narrowPeak" in fn:
                        peakfiles.append(fn)
    consensusfile = os.path.join(consensusdir, "consensus.bed")
    if not peakfiles:
        logger.info("No suitable peak files found, skipping consensus peak calling.")
        return samples
    consensusfile = consensus(peakfiles, consensusfile, data)
    if not utils.file_exists(consensusfile):
        logger.warning("No consensus peaks found.")
        return samples
    saffile = consensus_to_saf(consensusfile,
                               os.path.splitext(consensusfile)[0] + ".saf")
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ("peaks_files", "consensus"), {"main": consensusfile})
        new_samples.append([data])
    return new_samples
def _find_lib(data):
    """Find mirge libs"""
    options = " ".join(data.get('resources', {}).get('mirge', {}).get("options", ""))
    if options.find("-lib") > -1 and utils.file_exists(options.split()[1]):
        return options
    if not options:
        logger.warning("miRge libraries not found. Follow these instructions to install them:")
        logger.warning("https://github.com/mhalushka/miRge#download-libraries")
        logger.warning("Then, pass -lib LIB_PATH with resources:mirge:options:[...]")
        logger.warning("More information: https://bcbio-nextgen.readthedocs.io/en/latest/contents/pipelines.html#smallrna-seq")
def _generate_estimates(bam_file, out_base, failed_file, exts, data):
    background = {"dataset": "1000g.phase3",
                  "nvars": "100k",
                  "build": "b38" if dd.get_genome_build(data) == "hg38" else "b37"}
    with file_transaction(data, out_base) as tx_out_base:
        cmd = ["verifybamid2", background["dataset"], background["nvars"], background["build"],
               "--Reference", dd.get_ref_file(data), "--Output", tx_out_base]
        cmd += _get_input_args(bam_file, data, out_base, background)
        try:
            do.run(cmd, "VerifyBamID contamination checks")
        except subprocess.CalledProcessError as msg:
            def allowed_errors(l):
                return (l.find("Insufficient Available markers") >= 0 or
                        l.find("No reads found in any of the regions") >= 0)
            if any([allowed_errors(l) for l in str(msg).split("\n")]):
                logger.info("Skipping VerifyBamID, not enough overlapping markers found: %s" %
                            dd.get_sample_name(data))
                with open(failed_file, "w") as out_handle:
                    out_handle.write(str(msg))
            else:
                logger.warning(str(msg))
                raise
        else:
            # Fix any sample name problems, for pileups
            shutil.move(tx_out_base + ".selfSM", tx_out_base + ".selfSM.orig")
            with open(tx_out_base + ".selfSM.orig") as in_handle:
                with open(tx_out_base + ".selfSM", "w") as out_handle:
                    sample_name = None
                    for line in in_handle:
                        if line.startswith("DefaultSampleName"):
                            line = line.replace("DefaultSampleName", dd.get_sample_name(data))
                        # work around bug in finding SM from BAM RG at end of line
                        if len(line.strip().split("\t")) == 1:
                            sample_name = line.strip()
                            line = None
                        elif sample_name:
                            parts = line.split("\t")
                            parts[0] = sample_name
                            line = "\t".join(parts)
                            sample_name = None
                        if line:
                            out_handle.write(line)
            for e in exts + [".selfSM"]:
                if os.path.exists(tx_out_base + e):
                    shutil.copy(tx_out_base + e, out_base + e)
def quantitate_expression_parallel(samples, run_parallel):
    """Quantitate expression; all programs run here should be multithreaded
    to take advantage of the threaded run_parallel environment.
    """
    data = samples[0][0]
    to_index = determine_indexes_to_make(samples)
    samples = run_parallel("generate_transcript_counts", samples)
    if "cufflinks" in dd.get_expression_caller(data):
        samples = run_parallel("run_cufflinks", samples)
    if "stringtie" in dd.get_expression_caller(data):
        samples = run_parallel("run_stringtie_expression", samples)
    if ("kallisto" in dd.get_expression_caller(data) or
            dd.get_fusion_mode(data) or
            "pizzly" in dd.get_fusion_caller(data, [])):
        run_parallel("run_kallisto_index", [to_index])
        samples = run_parallel("run_kallisto_rnaseq", samples)
    if "sailfish" in dd.get_expression_caller(data):
        run_parallel("run_sailfish_index", [to_index])
        samples = run_parallel("run_sailfish", samples)
    # always run salmon
    run_parallel("run_salmon_index", [to_index])
    if dd.get_quantify_genome_alignments(data):
        if dd.get_aligner(data).lower() != "star":
            if dd.get_genome_build(data) == "hg38":
                logger.warning("Whole genome alignment-based Salmon quantification is "
                               "only supported for the STAR aligner. Since this is hg38 we will fall "
                               "back to the decoy method")
                samples = run_parallel("run_salmon_decoy", samples)
            else:
                logger.warning("Whole genome alignment-based Salmon quantification is "
                               "only supported for the STAR aligner. Falling back to the "
                               "transcriptome-only method.")
                samples = run_parallel("run_salmon_reads", samples)
        else:
            samples = run_parallel("run_salmon_bam", samples)
    else:
        samples = run_parallel("run_salmon_reads", samples)
    samples = run_parallel("detect_fusions", samples)
    return samples
def run(data):
    if not aligner_supports_fusion(data):
        aligner = dd.get_aligner(data)
        logger.warning("Oncofuse is not supported for the %s aligner, skipping." % aligner)
        return None
    config = data["config"]
    genome_build = data.get("genome_build", "")
    input_type, input_dir, input_file = _get_input_para(data)
    if genome_build == "GRCh37":  # assume genome_build is hg19 otherwise
        if config["algorithm"].get("aligner") in ["star"]:
            if file_exists(input_file):
                input_file = _fix_star_junction_output(input_file)
        if config["algorithm"].get("aligner") in ["tophat", "tophat2"]:
            if file_exists(input_file):
                input_file = _fix_tophat_junction_output(input_file)
    elif "hg19" not in genome_build:
        return None
    # handle cases when fusion file doesn't exist
    if not file_exists(input_file):
        return None
    out_file = os.path.join(input_dir, "oncofuse_out.txt")
    if file_exists(out_file):
        return out_file
    oncofuse = config_utils.get_program("oncofuse", config)
    tissue_type = _oncofuse_tissue_arg_from_config(data)
    resources = config_utils.get_resources("oncofuse", config)
    if not file_exists(out_file):
        cl = [oncofuse]
        cl += resources.get("jvm_opts", ["-Xms750m", "-Xmx5g"])
        with file_transaction(data, out_file) as tx_out_file:
            cl += [input_file, input_type, tissue_type, tx_out_file]
            cmd = " ".join(cl)
            try:
                do.run(cmd, "oncofuse fusion detection", data)
            except:
                do.run("touch %s && echo '# failed' >> %s" % (tx_out_file, tx_out_file),
                       "oncofuse failed", data)
    return out_file
def clean_chipseq_alignment(data):
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    work_bam = dd.get_work_bam(data)
    clean_bam = remove_nonassembled_chrom(work_bam, data)
    if not dd.get_keep_multimapped(data):
        clean_bam = remove_multimappers(clean_bam, data)
    if not dd.get_keep_duplicates(data):
        clean_bam = bam.remove_duplicates(clean_bam, data)
    data["work_bam"] = clean_bam
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = remove_blacklist_regions(dd.get_work_bam(data), encode_bed, data['config'])
        bam.index(data["work_bam"], data['config'])
    try:
        data["bigwig"] = _normalized_bam_coverage(dd.get_sample_name(data),
                                                  dd.get_work_bam(data), data)
    except subprocess.CalledProcessError:
        logger.warning(f"{dd.get_work_bam(data)} was too sparse to normalize, "
                       f"falling back to non-normalized coverage.")
        data["bigwig"] = _bam_coverage(dd.get_sample_name(data),
                                       dd.get_work_bam(data), data)
    return [[data]]
def clean_chipseq_alignment(data):
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    method = dd.get_chip_method(data)
    if method == "atac":
        data = shift_ATAC(data)
    work_bam = dd.get_work_bam(data)
    work_bam = bam.sort(work_bam, dd.get_config(data))
    bam.index(work_bam, dd.get_config(data))
    # an unfiltered BAM file is useful for calculating some metrics later
    data = tz.assoc_in(data, ['chipseq', 'align', "unfiltered"], work_bam)
    clean_bam = remove_nonassembled_chrom(work_bam, data)
    clean_bam = remove_mitochondrial_reads(clean_bam, data)
    data = atac.calculate_complexity_metrics(clean_bam, data)
    if not dd.get_keep_multimapped(data):
        clean_bam = remove_multimappers(clean_bam, data)
    if not dd.get_keep_duplicates(data):
        clean_bam = bam.remove_duplicates(clean_bam, data)
    data["work_bam"] = clean_bam
    # for ATAC-seq, break alignments into NF, mono/di/tri nucleosome BAM files
    if method == "atac":
        data = atac.split_ATAC(data)
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = remove_blacklist_regions(dd.get_work_bam(data), encode_bed, data['config'])
        bam.index(data["work_bam"], data['config'])
    try:
        data["bigwig"] = _normalized_bam_coverage(dd.get_sample_name(data),
                                                  dd.get_work_bam(data), data)
    except subprocess.CalledProcessError:
        logger.warning(f"{dd.get_work_bam(data)} was too sparse to normalize, "
                       f"falling back to non-normalized coverage.")
        data["bigwig"] = _bam_coverage(dd.get_sample_name(data),
                                       dd.get_work_bam(data), data)
    return [[data]]
def _check_java_version(config, items):
    msg = java(config, items)
    if msg:
        logger.warning("miraligner is only compatible with java 1.7")
        return False
    return True
def combine_pairs(input_files, force_single=False, full_name=False, separators=None):
    """Call files pairs if they are completely the same except one has _1
    and the other has _2. Returns a list of tuples of pairs or singles.

    From bipy.utils (https://github.com/roryk/bipy/blob/master/bipy/utils.py)
    Adjusted to allow different input paths or extensions for matching files.
    """
    PAIR_FILE_IDENTIFIERS = set(["1", "2", "3", "4"])

    pairs = []
    used = set([])
    used_separators = set([])
    separators = separators if separators else ("R", "_", "-", ".")
    for in_file in input_files:
        matches = set([])
        if in_file in used:
            continue
        if not force_single:
            for comp_file in input_files:
                if comp_file in used or comp_file == in_file:
                    continue
                if full_name:
                    in_file_name = in_file
                    comp_file_name = comp_file
                else:
                    in_file_name = os.path.basename(in_file)
                    comp_file_name = os.path.basename(comp_file)
                a = rstrip_extra(utils.splitext_plus(in_file_name)[0])
                b = rstrip_extra(utils.splitext_plus(comp_file_name)[0])
                if len(a) != len(b):
                    continue
                s = dif(a, b)
                # no differences, so it's the same file stem
                if len(s) == 0:
                    logger.error("%s and %s have the same stem, so we don't know "
                                 "how to assign them to the sample data in the CSV. To "
                                 "get around this you can rename one of the files. "
                                 "If they are meant to be the same sample run in two "
                                 "lanes, combine them first with the "
                                 "bcbio_prepare_samples.py script. "
                                 "(http://bcbio-nextgen.readthedocs.io/en/latest/contents/configuration.html#multiple-files-per-sample)"
                                 % (in_file, comp_file))
                    sys.exit(1)
                if len(s) > 1:
                    continue  # there is more than 1 difference
                if (a[s[0]] in PAIR_FILE_IDENTIFIERS and
                        b[s[0]] in PAIR_FILE_IDENTIFIERS):
                    # if the 1/2 isn't the last digit before a separator, skip;
                    # this skips stuff like 2P 2A, often denoting replicates,
                    # not read pairings
                    if len(b) > (s[0] + 1):
                        if b[s[0] + 1] not in ("_", "-", "."):
                            continue
                    # only pair if the character before the 1/2 is a known
                    # separator (e.g. R, _, -, .)
                    if b[s[0] - 1] in separators:
                        used_separators.add(b[s[0] - 1])
                        if len(used_separators) > 1:
                            logger.warning("Multiple separators were used to split "
                                           "paired reads: %s" % used_separators)
                            logger.warning("This can lead to wrong read pairing.")
                            logger.warning("Use the --separator option in "
                                           "bcbio_prepare_samples.py to specify only "
                                           "one, for instance --separator R.")
                        matches.update([in_file, comp_file])
                        used.update([in_file, comp_file])
        if matches:
            pairs.append(sort_filenames(list(matches)))
        if in_file not in used:
            pairs.append([in_file])
            used.add(in_file)
    return pairs
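# A minimal usage sketch (hypothetical filenames) of combine_pairs(): files
# whose stems differ only in a read identifier (1-4) after a known separator
# are grouped, everything else stays single.
#
# files = ["/data/s1_R1.fastq.gz", "/data/s1_R2.fastq.gz", "/data/s2.fastq.gz"]
# combine_pairs(files)
# -> [["/data/s1_R1.fastq.gz", "/data/s1_R2.fastq.gz"], ["/data/s2.fastq.gz"]]
# combine_pairs(files, force_single=True)  # keeps every file as its own unit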
def run_peddy(samples, out_dir=None):
    data = samples[0]
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if isinstance(batch, (list, tuple)):
        batch = batch[0]
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"

    vcf_file = None
    for d in samples:
        vcinfo = None
        if dd.get_phenotype(d) == "germline" or dd.get_phenotype(d) not in ["tumor"]:
            vcinfo = variant.get_active_vcinfo(d, use_ensemble=False)
        if not vcinfo and dd.get_phenotype(d) in ["tumor"]:
            vcinfo = variant.extract_germline_vcinfo(d, peddy_dir)
        if vcinfo:
            for key in ["germline", "vrn_file"]:
                if vcinfo and vcinfo.get(key) and utils.file_exists(vcinfo[key]):
                    if vcinfo[key] and dd.get_sample_name(d) in vcfutils.get_samples(vcinfo[key]):
                        if vcinfo[key] and vcfutils.vcf_has_nonfiltered_variants(vcinfo[key]):
                            vcf_file = vcinfo[key]
                            break
    peddy = config_utils.get_program("peddy", data) if config_utils.program_installed("peddy", data) else None
    config_skips = any(["peddy" in dd.get_tools_off(d) for d in samples])
    if not peddy or not vcf_file or not vcfanno.is_human(data) or config_skips:
        if not peddy:
            reason = "peddy executable not found"
        elif config_skips:
            reason = "peddy in tools_off configuration"
        elif not vcfanno.is_human(data):
            reason = "sample is not human"
        else:
            assert not vcf_file
            reason = "no suitable VCF files found with the sample and non-filtered variants"
        msg = "Skipping peddy QC, %s: %s" % (reason, [dd.get_sample_name(d) for d in samples])
        with open(peddy_prefix + "-failed.log", "w") as out_handle:
            out_handle.write(msg)
        logger.info(msg)
        return samples
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    if not file_exists(peddy_report):
        ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
        num_cores = dd.get_num_cores(data)
        with tx_tmpdir(data) as tx_dir:
            peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix))
            # Redirects stderr because incredibly noisy with no intervals found messages from cyvcf2
            stderr_log = os.path.join(tx_dir, "run-stderr.log")
            sites_str = "--sites hg38" if dd.get_genome_build(data) == "hg38" else ""
            locale = utils.locale_export()
            cmd = ("{locale} {peddy} -p {num_cores} {sites_str} --plot --prefix {peddy_prefix_tx} "
                   "{vcf_file} {ped_file} 2> {stderr_log}")
            message = "Running peddy on {vcf_file} against {ped_file}."
            try:
                do.run(cmd.format(**locals()), message.format(**locals()))
            except:
                to_show = collections.deque(maxlen=100)
                with open(stderr_log) as in_handle:
                    for line in in_handle:
                        to_show.append(line)
                def allowed_errors(l):
                    return ((l.find("IndexError") >= 0 and l.find("is out of bounds for axis") >= 0) or
                            (l.find("n_components=") >= 0 and l.find("must be between 1 and n_features=") >= 0) or
                            (l.find("n_components=") >= 0 and l.find("must be between 1 and min") >= 0) or
                            (l.find("Input contains NaN, infinity or a value too large for dtype") >= 0))
                def all_line_errors(l):
                    return (l.find("no intervals found for") >= 0)
                if any([allowed_errors(l) for l in to_show]) or all([all_line_errors(l) for l in to_show]):
                    logger.info("Skipping peddy because no variants overlap with checks: %s" % batch)
                    with open(peddy_prefix + "-failed.log", "w") as out_handle:
                        out_handle.write("peddy did not find overlaps with 1kg sites in VCF, skipping")
                    return samples
                else:
                    logger.warning("".join(to_show))
                    raise
            for ext in PEDDY_OUT_EXTENSIONS:
                if os.path.exists(peddy_prefix_tx + ext):
                    shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    peddyfiles = expected_peddy_files(peddy_report, batch)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)