def _do_run(paired):
    """Perform Battenberg calling with the paired dataset.

    This purposely does not use a temporary directory for the output
    since Battenberg does smart restarts.
    """
    work_dir = _sv_workdir(paired.tumor_data)
    ignore_file = os.path.join(work_dir, "ignore_chromosomes.txt")
    out = _get_battenberg_out(paired, work_dir)
    if len(_missing_files(out)) > 0:
        ref_file = dd.get_ref_file(paired.tumor_data)
        bat_datadir = os.path.normpath(os.path.join(os.path.dirname(ref_file), os.pardir, "battenberg"))
        ignore_file = _make_ignore_file(work_dir, ref_file,
                                        os.path.join(bat_datadir, "impute", "impute_info.txt"),
                                        ignore_file)
        local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                     "lib", "R", "site-library")
        perl_exports = utils.get_perl_exports()
        tumor_bam = paired.tumor_bam
        normal_bam = paired.normal_bam
        platform = dd.get_platform(paired.tumor_data)
        genome_build = paired.tumor_data["genome_build"]
        # scale cores to avoid over-using memory during imputation
        cores = max(1, int(dd.get_num_cores(paired.tumor_data) * 0.5))
        cmd = ("export R_LIBS_USER={local_sitelib} && "
               "{perl_exports} && "
               "battenberg.pl -t {cores} -o {work_dir} -r {ref_file}.fai "
               "-tb {tumor_bam} -nb {normal_bam} -e {bat_datadir}/impute/impute_info.txt "
               "-u {bat_datadir}/1000genomesloci -c {bat_datadir}/probloci.txt "
               "-ig {ignore_file} "
               "-assembly {genome_build} -species Human -platform {platform}")
        do.run(cmd.format(**locals()), "Battenberg CNV calling")
        assert len(_missing_files(out)) == 0, "Missing Battenberg output: %s" % _missing_files(out)
    out["ignore"] = ignore_file
    return out
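# The restart logic above hinges on ``_get_battenberg_out`` and
# ``_missing_files``, which are not shown here. A minimal sketch of the
# latter, assuming ``out`` maps output names to expected file paths
# (hypothetical; the real helper may differ):
import os

def _missing_files(out):
    """Return expected Battenberg output paths not yet present on disk."""
    return [fname for name, fname in out.items()
            if isinstance(fname, str) and not os.path.exists(fname)]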
def run(data):
    config = data[0][0]['config']
    work_dir = dd.get_work_dir(data[0][0])
    genome = dd.get_ref_file(data[0][0])
    mirdeep2 = os.path.join(os.path.dirname(sys.executable), "miRDeep2.pl")
    perl_exports = get_perl_exports()
    hairpin, mature, species = "none", "none", "na"
    rfam_file = dd.get_mirdeep2_file(data[0][0])
    if file_exists(dd.get_mirbase_hairpin(data[0][0])):
        species = dd.get_species(data[0][0])
        hairpin = dd.get_mirbase_hairpin(data[0][0])
        mature = dd.get_mirbase_mature(data[0][0])
    logger.debug("Preparing for mirdeep2 analysis.")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    seqs_dir = op.join(work_dir, "seqcluster", "prepare")
    collapsed = op.join(seqs_dir, "seqs.ma")
    out_dir = op.join(work_dir, "mirdeep2")
    out_file = op.join(out_dir, "result_res.csv")
    safe_makedir(out_dir)
    with chdir(out_dir):
        collapsed, bam_file = _prepare_inputs(collapsed, bam_file, out_dir)
        cmd = ("{perl_exports} && perl {mirdeep2} {collapsed} {genome} {bam_file} "
               "{mature} none {hairpin} -f {rfam_file} -r simple -c -P -t {species} "
               "-z res").format(**locals())
        if file_exists(mirdeep2) and not file_exists(out_file) and file_exists(rfam_file):
            try:
                do.run(cmd, "Running mirdeep2.")
            except Exception:
                logger.warning("mirdeep2 failed. Please report the error to "
                               "https://github.com/lpantano/mirdeep2_core/issues.")
        if file_exists(out_file):
            novel_db = _parse_novel(out_file, dd.get_species(data[0][0]))
            return novel_db
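# ``_prepare_inputs`` is referenced but not shown. miRDeep2 expects collapsed
# reads as FASTA with ``_xN`` count suffixes, so a plausible sketch converts
# seqcluster's seqs.ma table (name, sequence, per-sample counts are assumed
# columns) into that format and returns the new paths. This is hypothetical;
# the real helper may also rewrite the alignment file:
import os

def _prepare_inputs(ma_fn, bam_file, out_dir):
    """Convert seqs.ma into miRDeep2's collapsed FASTA read format."""
    fixed_fa = os.path.join(out_dir, "file_reads.fa")
    with open(ma_fn) as in_handle, open(fixed_fa, "w") as out_handle:
        next(in_handle, None)  # assumed header line
        for line in in_handle:
            cols = line.strip().split("\t")
            name, seq = cols[0], cols[1]
            total = sum(int(x) for x in cols[2:])  # counts across samples
            out_handle.write(">%s_x%s\n%s\n" % (name, total, seq))
    return fixed_fa, bam_file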
def _run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect indels with Scalpel.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                ann_file = _run_scalpel_caller(align_bams, items, ref_file,
                                               assoc_files, region, out_file)
                return ann_file
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            db_file = os.path.join(tmp_path, "main", "somatic.db")
            if not os.path.exists(db_file + ".dir"):
                if os.path.exists(tmp_path):
                    utils.remove_safe(tmp_path)
                opts = " ".join(_scalpel_options_from_config(items, config, out_file,
                                                             region, tmp_path))
                opts += " --ref {}".format(ref_file)
                opts += " --dir %s" % tmp_path
                # calling
                cl = ("{perl_exports} && "
                      "scalpel-discovery --somatic {opts} --tumor {paired.tumor_bam} "
                      "--normal {paired.normal_bam}")
                do.run(cl.format(**locals()), "Genotyping paired variants with Scalpel", {})
            # filtering to adjust input parameters
            bed_opts = " ".join(_scalpel_bed_file_opts(items, config, out_file,
                                                       region, tmp_path))
            use_defaults = True
            if use_defaults:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic.indel.vcf")
            # Uses default filters but can tweak min-alt-count-tumor and
            # min-phred-fisher to swap precision for sensitivity
            else:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic-indel-filter.vcf.gz")
                with file_transaction(config, scalpel_tmp_file) as tx_indel_file:
                    cmd = ("{perl_exports} && "
                           "scalpel-export --somatic {bed_opts} --ref {ref_file} "
                           "--db {db_file} "
                           "--min-alt-count-tumor 5 --min-phred-fisher 10 "
                           "--min-vaf-tumor 0.1 "
                           "| bgzip -c > {tx_indel_file}")
                    do.run(cmd.format(**locals()), "Scalpel somatic indel filter", {})
            scalpel_tmp_file = bgzip_and_index(scalpel_tmp_file, config)
            scalpel_tmp_file_common = bgzip_and_index(os.path.join(tmp_path, "main/common.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            bcftools_cmd_common = get_scalpel_bcftools_filter_expression("reject", config)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cl2 = ("vcfcat <({bcftools_cmd_chi2} {scalpel_tmp_file}) "
                   "<({bcftools_cmd_common} {scalpel_tmp_file_common}) | "
                   " {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file, config)
    return ann_file
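# ``get_scalpel_bcftools_filter_expression`` builds the bcftools command
# prefix used above (invoked as "<prefix> <vcf_file>"). A minimal sketch,
# assuming "chi2" soft-filters calls failing Scalpel's chi-squared test and
# "reject" flags everything in the shared tumor/normal indel file; the
# thresholds and FILTER tags here are hypothetical, not the original values:
def get_scalpel_bcftools_filter_expression(filter_type, config):
    prefix = "bcftools filter -m '+' -O v"
    if filter_type == "chi2":
        # soft-filter calls whose chi-squared score suggests allele imbalance
        return prefix + " -e 'INFO/CHI2 > 20.0' -s LowChi2"
    elif filter_type == "reject":
        # records from the shared tumor/normal file are flagged wholesale
        return prefix + " -e 'TYPE=\"indel\"' -s REJECT"
    else:
        raise ValueError("Unexpected filter type: %s" % filter_type)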
def _mint_trna_annotation(data):
    """
    use MINTmap to quantify tRNAs
    """
    trna_lookup = op.join(dd.get_srna_mint_lookup(data))
    trna_space = op.join(dd.get_srna_mint_space(data))
    trna_other = op.join(dd.get_srna_mint_other(data))
    name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "trna_mint", name))
    in_file = op.basename(data["clean_fastq"])
    mintmap = os.path.realpath(os.path.join(os.path.dirname(sys.executable), "MINTmap.pl"))
    perl_export = utils.get_perl_exports()
    if not file_exists(trna_lookup) or not file_exists(mintmap):
        logger.info("There is no tRNA annotation to run MINTmap.")
        return work_dir
    jar_folder = os.path.join(os.path.dirname(mintmap), "MINTplates")
    out_file = op.join(work_dir, name + "-MINTmap_v1-exclusive-tRFs.expression.txt")
    if not file_exists(out_file):
        with tx_tmpdir(data) as txdir:
            with utils.chdir(txdir):
                utils.symlink_plus(data["clean_fastq"], op.join(txdir, in_file))
                cmd = ("{perl_export} && {mintmap} -f {in_file} -p {name} "
                       "-l {trna_lookup} -s {trna_space} -j {jar_folder} "
                       "-o {trna_other}").format(**locals())
                do.run(cmd, "tRNA for %s" % name)
                for filename in glob.glob("*MINTmap*"):
                    shutil.move(filename, work_dir)
    return work_dir
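# MINTmap writes tab-separated expression tables among the ``*MINTmap*``
# outputs moved above. A hedged reader sketch; the default column positions
# for the tRF sequence and raw count are an assumption about the MINTmap v1
# layout, so verify them against your installed version:
import csv

def read_mintmap_expression(fname, seq_col=1, count_col=3):
    """Yield (tRF sequence, raw read count) rows from a MINTmap table."""
    with open(fname) as in_handle:
        reader = csv.reader(in_handle, delimiter="\t")
        next(reader, None)  # skip assumed header line
        for row in reader:
            if len(row) > max(seq_col, count_col):
                yield row[seq_col], int(row[count_col])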
def _trna_annotation(data):
    """
    use tDRmapper to quantify tRNAs
    """
    trna_ref = op.join(dd.get_srna_trna_file(data))
    name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "trna", name))
    in_file = op.basename(data["clean_fastq"])
    tdrmapper = os.path.join(os.path.dirname(sys.executable), "TdrMappingScripts.pl")
    perl_export = utils.get_perl_exports()
    if not file_exists(trna_ref) or not file_exists(tdrmapper):
        logger.info("There is no tRNA annotation to run TdrMapper.")
        return work_dir
    out_file = op.join(work_dir, in_file + ".hq_cs.mapped")
    if not file_exists(out_file):
        with tx_tmpdir(data) as txdir:
            with utils.chdir(txdir):
                utils.symlink_plus(data["clean_fastq"], op.join(txdir, in_file))
                cmd = ("{perl_export} && perl {tdrmapper} {trna_ref} {in_file}").format(**locals())
                do.run(cmd, "tRNA for %s" % name)
                for filename in glob.glob("*mapped*"):
                    shutil.move(filename, work_dir)
    return work_dir
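# Hedged sketch of how the two tRNA annotators above could be combined per
# sample; the driver name and return shape are assumptions, not part of the
# original module:
def trna_annotation(data):
    """Run tDRmapper and MINTmap quantification, returning both work dirs."""
    return {"trna": _trna_annotation(data),
            "trna_mint": _mint_trna_annotation(data)}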
def run(data):
    config = data[0][0]['config']
    work_dir = dd.get_work_dir(data[0][0])
    genome = dd.get_ref_file(data[0][0])
    mirdeep2 = os.path.join(os.path.dirname(sys.executable), "miRDeep2.pl")
    perl_exports = get_perl_exports()
    mirbase = op.abspath(op.dirname(dd.get_mirbase_ref(data[0][0])))
    species = dd.get_species(data[0][0])
    hairpin = op.join(mirbase, "hairpin.fa")
    mature = op.join(mirbase, "mature.fa")
    rfam_file = op.join(mirbase, "Rfam_for_miRDeep.fa")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    seqs_dir = op.join(work_dir, "seqcluster", "prepare")
    collapsed = op.join(seqs_dir, "seqs.ma")
    out_dir = op.join(work_dir, "mirdeep2")
    out_file = op.join(out_dir, "result_res.csv")
    safe_makedir(out_dir)
    with chdir(out_dir):
        collapsed, bam_file = _prepare_inputs(collapsed, bam_file, out_dir)
        cmd = ("{perl_exports} && {mirdeep2} {collapsed} {genome} {bam_file} "
               "{mature} none {hairpin} -f {rfam_file} -r simple -c -d -P "
               "-t {species} -z res").format(**locals())
        if (file_exists(mirdeep2) and not file_exists(out_file)
                and file_exists(mature) and file_exists(rfam_file)):
            do.run(cmd, "Running mirdeep2.")
        if file_exists(out_file):
            novel_db = _parse_novel(out_file, dd.get_species(data[0][0]))
            return novel_db
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("vep", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                # HGVS requires a bgzip compressed, faidx indexed input file or is unusably slow
                if dd.get_ref_file_compressed(data):
                    hgvs_compatible = True
                    config_args = ["--fasta", dd.get_ref_file_compressed(data)]
                else:
                    hgvs_compatible = False
                    config_args = ["--fasta", dd.get_ref_file(data)]
                if is_human:
                    plugin_fns = {"loftee": _get_loftee, "maxentscan": _get_maxentscan,
                                  "genesplicer": _get_genesplicer,
                                  "spliceregion": _get_spliceregion}
                    plugins = ["loftee"]
                    if "vep_splicesite_annotations" in dd.get_tools_on(data):
                        # "genesplicer" too unstable so currently removed
                        plugins += ["maxentscan", "spliceregion"]
                    for plugin in plugins:
                        plugin_args = plugin_fns[plugin](data)
                        config_args += plugin_args
                    config_args += ["--sift", "b", "--polyphen", "b"]
                    if hgvs_compatible:
                        config_args += ["--hgvs", "--shift_hgvs", "1"]
                if (dd.get_effects_transcripts(data).startswith("canonical")
                        or tz.get_in(("config", "algorithm", "clinical_reporting"), data)):
                    config_args += ["--pick_allele"]
                if ensembl_name.endswith("_merged"):
                    config_args += ["--merged"]
                    ensembl_name = ensembl_name.replace("_merged", "")
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name, "--no_stats", "--cache",
                       "--offline", "--dir", vep_dir,
                       "--symbol", "--numbers", "--biotype", "--total_length",
                       "--canonical", "--gene_phenotype", "--ccds", "--uniprot",
                       "--domains", "--regulatory", "--protein", "--tsl",
                       "--appris", "--af", "--max_af", "--af_1kg", "--af_esp",
                       "--af_gnomad", "--pubmed", "--variant_class", "--allele_number"] + config_args
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (perl_exports,
                                                                           " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
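# Each plugin helper above (_get_loftee, _get_maxentscan, ...) returns extra
# VEP command line arguments. A minimal sketch of the simplest case: the
# SpliceRegion plugin ships with VEP and needs no external data files. The
# other helpers presumably also locate plugin data directories, which is
# omitted here as an assumption:
def _get_spliceregion(data):
    """Enable VEP's SpliceRegion plugin for finer splice-site annotation."""
    return ["--plugin", "SpliceRegion"]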
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("vep", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                # HGVS requires a bgzip compressed, faidx indexed input file or is unusably slow
                if dd.get_ref_file_compressed(data):
                    hgvs_compatible = True
                    config_args = ["--fasta", dd.get_ref_file_compressed(data)]
                else:
                    hgvs_compatible = False
                    config_args = ["--fasta", dd.get_ref_file(data)]
                if is_human:
                    plugin_fns = {"loftee": _get_loftee, "maxentscan": _get_maxentscan,
                                  "genesplicer": _get_genesplicer,
                                  "spliceregion": _get_spliceregion}
                    plugins = ["loftee"]
                    if "vep_splicesite_annotations" in dd.get_tools_on(data):
                        # "genesplicer" too unstable so currently removed
                        plugins += ["maxentscan", "spliceregion"]
                    for plugin in plugins:
                        plugin_args = plugin_fns[plugin](data)
                        config_args += plugin_args
                    config_args += ["--sift", "b", "--polyphen", "b"]
                    if hgvs_compatible:
                        config_args += ["--hgvs", "--shift_hgvs", "1"]
                if (dd.get_effects_transcripts(data).startswith("canonical")
                        or tz.get_in(("config", "algorithm", "clinical_reporting"), data)):
                    config_args += ["--pick_allele"]
                if ensembl_name.endswith("_merged"):
                    config_args += ["--merged"]
                    ensembl_name = ensembl_name.replace("_merged", "")
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name, "--no_stats", "--cache",
                       "--offline", "--dir", vep_dir,
                       "--symbol", "--numbers", "--biotype", "--total_length",
                       "--canonical", "--gene_phenotype", "--ccds", "--uniprot",
                       "--domains", "--regulatory", "--protein", "--tsl",
                       "--appris", "--af", "--max_af", "--af_1kg", "--af_esp",
                       "--af_gnomad", "--pubmed", "--variant_class", "--allele_number"] + config_args
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (perl_exports,
                                                                           " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        return vcfutils.bgzip_and_index(out_file, data["config"])
def prep_vep_cache(dbkey, ref_file, tooldir=None, config=None):
    """Ensure correct installation of VEP cache file.
    """
    if config is None:
        config = {}
    resource_file = os.path.join(os.path.dirname(ref_file), "%s-resources.yaml" % dbkey)
    if os.path.exists(resource_file):
        with open(resource_file) as in_handle:
            resources = yaml.safe_load(in_handle)
        ensembl_name = tz.get_in(["aliases", "ensembl"], resources)
        symlink_dir = _special_dbkey_maps(dbkey, ref_file)
        if ensembl_name and ensembl_name.find("_vep_") == -1:
            raise ValueError("%s has an incorrect ensembl value. "
                             "It should have _vep_ in the name. "
                             "Remove the line or fix the name to avoid the error." % dbkey)
        if symlink_dir and ensembl_name:
            species, vepv = ensembl_name.split("_vep_")
            return symlink_dir, species
        elif ensembl_name:
            species, vepv = ensembl_name.split("_vep_")
            vep_dir = utils.safe_makedir(os.path.normpath(os.path.join(
                os.path.dirname(os.path.dirname(ref_file)), "vep")))
            out_dir = os.path.join(vep_dir, species, vepv)
            if not os.path.exists(out_dir):
                tmp_dir = utils.safe_makedir(os.path.join(vep_dir, species, "txtmp"))
                eversion = vepv.split("_")[0]
                url = "ftp://ftp.ensembl.org/pub/release-%s/variation/VEP/%s.tar.gz" % (eversion, ensembl_name)
                with utils.chdir(tmp_dir):
                    subprocess.check_call(["wget", "--no-check-certificate", "-c", url])
                vep_path = "%s/bin/" % tooldir if tooldir else ""
                perl_exports = utils.get_perl_exports()
                cmd = ["%svep_install" % vep_path, "-a", "c", "-s", ensembl_name,
                       "-c", vep_dir, "-u", tmp_dir, "--NO_UPDATE", "--VERSION", eversion]
                do.run("%s && %s" % (perl_exports, " ".join(cmd)),
                       "Prepare VEP directory for %s" % ensembl_name)
                cmd = ["%svep_convert_cache" % vep_path, "--species", species,
                       "--version", vepv, "--dir", vep_dir,
                       "--force_overwrite", "--remove"]
                do.run("%s && %s" % (perl_exports, " ".join(cmd)),
                       "Convert VEP cache to tabix %s" % ensembl_name)
                for tmp_fname in os.listdir(tmp_dir):
                    os.remove(os.path.join(tmp_dir, tmp_fname))
                os.rmdir(tmp_dir)
                tmp_dir = os.path.join(vep_dir, "tmp")
                if os.path.exists(tmp_dir):
                    shutil.rmtree(tmp_dir)
            return vep_dir, species
    return None, None
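# The "_vep_" naming convention drives everything above. Illustrative values
# showing how a resources YAML entry (aliases: ensembl: ...) is split into
# the pieces used for the cache layout and the release download URL:
ensembl_name = "homo_sapiens_vep_95_GRCh38"   # illustrative value
species, vepv = ensembl_name.split("_vep_")   # ("homo_sapiens", "95_GRCh38")
eversion = vepv.split("_")[0]                 # "95" -> release-95 URL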
def _run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect indels with Scalpel.

    Single sample mode.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            if len(align_bams) > 1:
                message = "Scalpel does not currently support batch calling!"
                raise ValueError(message)
            input_bams = " ".join("%s" % x for x in align_bams)
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            tx_tmp_path = "%s-scalpel-work" % utils.splitext_plus(tx_out_file)[0]
            if os.path.exists(tmp_path):
                utils.remove_safe(tmp_path)
            opts = " ".join(_scalpel_options_from_config(items, config, out_file,
                                                         region, tmp_path))
            opts += " --dir %s" % tx_tmp_path
            min_cov = "3"  # minimum coverage
            opts += " --mincov %s" % min_cov
            perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            cmd = ("{perl_exports} && "
                   "scalpel-discovery --single {opts} --ref {ref_file} --bam {input_bams} ")
            do.run(cmd.format(**locals()), "Genotyping with Scalpel", {})
            shutil.move(tx_tmp_path, tmp_path)
            # parse produced variant file further
            scalpel_tmp_file = bgzip_and_index(os.path.join(tmp_path, "variants.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            sample_name_str = items[0]["name"][1]
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cl2 = ("{bcftools_cmd_chi2} {scalpel_tmp_file} | "
                   r"sed 's/FORMAT\tsample\(_name\)\{{0,1\}}/FORMAT\t{sample_name_str}/g' "
                   "| {fix_ambig} | vcfallelicprimitives -t DECOMPOSED --keep-geno | "
                   "vcffixup - | vcfstreamsort "
                   "{compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file, config)
    return ann_file
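# ``_scalpel_options_from_config`` is shared by both Scalpel callers above.
# A hedged sketch of its region handling, assuming ``region`` is either a
# chromosome name or a (chrom, start, end) tuple and that Scalpel's --bed
# flag accepts a BED file; the exact options assembled here are illustrative,
# not the original code:
import os

def _scalpel_options_from_config(items, config, out_file, region, tmp_path):
    opts = ["--format", "vcf", "--intarget"]
    if region:
        bed_file = "%s-regions.bed" % os.path.splitext(out_file)[0]
        with open(bed_file, "w") as out_handle:
            if isinstance(region, (list, tuple)):
                chrom, start, end = region
                out_handle.write("%s\t%s\t%s\n" % (chrom, start, end))
            else:
                # whole-chromosome fallback; end coordinate is a placeholder
                out_handle.write("%s\t0\t250000000\n" % region)
        opts += ["--bed", bed_file]
    return opts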
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                if is_human:
                    dbnsfp_args, dbnsfp_fields = _get_dbnsfp(data)
                    loftee_args, loftee_fields = _get_loftee(data)
                    prediction_args = ["--sift", "b", "--polyphen", "b"]
                    prediction_fields = ["PolyPhen", "SIFT"]
                else:
                    dbnsfp_args, dbnsfp_fields = [], []
                    loftee_args, loftee_fields = [], []
                    prediction_args, prediction_fields = [], []
                if tz.get_in(("config", "algorithm", "clinical_reporting"), data, False):
                    # In case of clinical reporting, we need one and only one variant per gene
                    # http://useast.ensembl.org/info/docs/tools/vep/script/vep_other.html#pick
                    # Also use hgvs reporting but requires indexing the reference file
                    clinical_args = ["--pick", "--hgvs", "--shift_hgvs", "1",
                                     "--fasta", dd.get_ref_file(data)]
                    clinical_fields = ["HGVSc", "HGVSp"]
                else:
                    clinical_args, clinical_fields = [], []
                std_fields = ["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL",
                              "Feature", "EXON"] + prediction_fields + \
                             ["Protein_position", "BIOTYPE", "CANONICAL", "CCDS"]
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name, "--no_stats", "--cache", "--offline",
                       "--dir", vep_dir, "--symbol", "--numbers", "--biotype",
                       "--total_length", "--canonical", "--gene_phenotype", "--ccds",
                       "--fields", ",".join(std_fields + dbnsfp_fields + loftee_fields +
                                            clinical_fields)] + \
                      prediction_args + dbnsfp_args + loftee_args + clinical_args
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (perl_exports,
                                                                           " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                config_args, config_fields, prediction_fields = [], [], []
                if is_human:
                    plugin_fns = {"dbnsfp": _get_dbnsfp, "loftee": _get_loftee,
                                  "dbscsnv": _get_dbscsnv, "maxentscan": _get_maxentscan,
                                  "genesplicer": _get_genesplicer}
                    plugins = tz.get_in(("config", "resources", "vep", "plugins"), data,
                                        ["dbnsfp", "loftee"])
                    for plugin in plugins:
                        plugin_args, plugin_fields = plugin_fns[plugin](data)
                        config_args += plugin_args
                        config_fields += plugin_fields
                    config_args += ["--sift", "b", "--polyphen", "b"]
                    prediction_fields += ["PolyPhen", "SIFT"]
                    # Use HGVS by default, requires indexing the reference genome
                    config_args += ["--hgvs", "--shift_hgvs", "1",
                                    "--fasta", dd.get_ref_file(data)]
                    config_fields += ["HGVSc", "HGVSp"]
                if (dd.get_effects_transcripts(data).startswith("canonical")
                        or tz.get_in(("config", "algorithm", "clinical_reporting"), data)):
                    config_args += ["--pick"]
                std_fields = ["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL",
                              "Feature", "EXON"] + prediction_fields + \
                             ["Protein_position", "BIOTYPE", "CANONICAL", "CCDS"]
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name, "--no_stats", "--cache", "--offline",
                       "--dir", vep_dir, "--symbol", "--numbers", "--biotype",
                       "--total_length", "--canonical", "--gene_phenotype", "--ccds",
                       "--fields", ",".join(std_fields + config_fields)] + config_args
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (perl_exports,
                                                                           " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
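# Plugin selection in this variant comes from the sample configuration via
# tz.get_in(("config", "resources", "vep", "plugins"), data, ["dbnsfp", "loftee"]).
# Illustrative fragment of the nested ``data`` structure; the keys shown are
# the ones the function reads, the values are examples only:
data_example = {
    "config": {
        "resources": {
            "vep": {
                "plugins": ["dbnsfp", "loftee", "maxentscan"],
                "options": ["--no_escape"],  # passed through as extra_args
            }
        }
    }
}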
def prep_vep_cache(dbkey, ref_file, tooldir=None, config=None):
    """Ensure correct installation of VEP cache file.
    """
    if config is None:
        config = {}
    resource_file = os.path.join(os.path.dirname(ref_file), "%s-resources.yaml" % dbkey)
    if os.path.exists(resource_file):
        with open(resource_file) as in_handle:
            resources = yaml.safe_load(in_handle)
        ensembl_name = tz.get_in(["aliases", "ensembl"], resources)
        symlink_dir = _special_dbkey_maps(dbkey, ref_file)
        if ensembl_name and ensembl_name.find("_vep_") == -1:
            raise ValueError("%s has an incorrect ensembl value. "
                             "It should have _vep_ in the name. "
                             "Remove the line or fix the name to avoid the error." % dbkey)
        if symlink_dir and ensembl_name:
            species, vepv = ensembl_name.split("_vep_")
            return symlink_dir, species
        elif ensembl_name:
            species, vepv = ensembl_name.split("_vep_")
            vep_dir = utils.safe_makedir(os.path.normpath(os.path.join(
                os.path.dirname(os.path.dirname(ref_file)), "vep")))
            out_dir = os.path.join(vep_dir, species, vepv)
            if not os.path.exists(out_dir):
                tmp_dir = utils.safe_makedir(os.path.join(vep_dir, species, "txtmp"))
                eversion = vepv.split("_")[0]
                url = "http://ftp.ensembl.org/pub/release-%s/variation/VEP/%s.tar.gz" % (eversion, ensembl_name)
                with utils.chdir(tmp_dir):
                    subprocess.check_call(["wget", "--no-check-certificate", "-c", url])
                vep_path = "%s/bin/" % tooldir if tooldir else ""
                perl_exports = utils.get_perl_exports()
                cmd = ["%svep_install" % vep_path, "-a", "c", "-s", ensembl_name,
                       "-c", vep_dir, "-u", tmp_dir, "--NO_UPDATE", "--VERSION", eversion]
                do.run("%s && %s" % (perl_exports, " ".join(cmd)),
                       "Prepare VEP directory for %s" % ensembl_name)
                cmd = ["%svep_convert_cache" % vep_path, "--species", species,
                       "--version", vepv, "--dir", vep_dir,
                       "--force_overwrite", "--remove"]
                do.run("%s && %s" % (perl_exports, " ".join(cmd)),
                       "Convert VEP cache to tabix %s" % ensembl_name)
                for tmp_fname in os.listdir(tmp_dir):
                    os.remove(os.path.join(tmp_dir, tmp_fname))
                os.rmdir(tmp_dir)
                tmp_dir = os.path.join(vep_dir, "tmp")
                if os.path.exists(tmp_dir):
                    shutil.rmtree(tmp_dir)
            return vep_dir, species
    return None, None
def run(data):
    config = data[0][0]['config']
    work_dir = dd.get_work_dir(data[0][0])
    genome = dd.get_ref_file(data[0][0])
    mirdeep2 = os.path.join(os.path.dirname(sys.executable), "miRDeep2.pl")
    perl_exports = get_perl_exports()
    hairpin, mature, species = "none", "none", "na"
    rfam_file = dd.get_mirdeep2_file(data[0][0])
    if file_exists(dd.get_mirbase_hairpin(data[0][0])):
        species = dd.get_species(data[0][0])
        hairpin = dd.get_mirbase_hairpin(data[0][0])
        mature = dd.get_mirbase_mature(data[0][0])
    logger.debug("Preparing for mirdeep2 analysis.")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    seqs_dir = op.join(work_dir, "seqcluster", "prepare")
    collapsed = op.join(seqs_dir, "seqs.ma")
    out_dir = op.join(work_dir, "mirdeep2")
    out_file = op.join(out_dir, "result_res.csv")
    safe_makedir(out_dir)
    if not file_exists(rfam_file):
        logger.warning("mirdeep2 Rfam file not installed. Skipping...")
        return None
    if not file_exists(mirdeep2):
        logger.warning("mirdeep2 executable file not found. Skipping...")
        return None
    with chdir(out_dir):
        collapsed, bam_file = _prepare_inputs(collapsed, bam_file, out_dir)
        cmd = ("{perl_exports} && perl {mirdeep2} {collapsed} {genome} {bam_file} "
               "{mature} none {hairpin} -f {rfam_file} -r simple -c -P -t {species} "
               "-z res").format(**locals())
        if not file_exists(out_file):
            try:
                do.run(cmd, "Running mirdeep2.")
            except Exception:
                logger.warning("mirdeep2 failed. Please report the error to "
                               "https://github.com/lpantano/mirdeep2_core/issues.")
        if file_exists(out_file):
            novel_db = _parse_novel(out_file, dd.get_species(data[0][0]))
            return novel_db
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                config_args, config_fields, prediction_fields = [], [], []
                if is_human:
                    plugin_fns = {"dbnsfp": _get_dbnsfp, "loftee": _get_loftee,
                                  "dbscsnv": _get_dbscsnv, "maxentscan": _get_maxentscan,
                                  "genesplicer": _get_genesplicer}
                    plugins = tz.get_in(("config", "resources", "vep", "plugins"), data,
                                        ["dbnsfp", "loftee"])
                    for plugin in plugins:
                        plugin_args, plugin_fields = plugin_fns[plugin](data)
                        config_args += plugin_args
                        config_fields += plugin_fields
                    config_args += ["--sift", "b", "--polyphen", "b"]
                    prediction_fields += ["PolyPhen", "SIFT"]
                    # Use HGVS by default, requires indexing the reference genome
                    config_args += ["--hgvs", "--shift_hgvs", "1",
                                    "--fasta", dd.get_ref_file(data)]
                    config_fields += ["HGVSc", "HGVSp"]
                if (dd.get_effects_transcripts(data).startswith("canonical")
                        or tz.get_in(("config", "algorithm", "clinical_reporting"), data)):
                    config_args += ["--pick"]
                std_fields = ["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL",
                              "Feature", "EXON"] + prediction_fields + \
                             ["Protein_position", "BIOTYPE", "CANONICAL", "CCDS"]
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name, "--no_stats", "--cache", "--offline",
                       "--dir", vep_dir, "--symbol", "--numbers", "--biotype",
                       "--total_length", "--canonical", "--gene_phenotype", "--ccds",
                       "--fields", ",".join(std_fields + config_fields)] + config_args
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (perl_exports,
                                                                           " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                if is_human:
                    dbnsfp_args, dbnsfp_fields = _get_dbnsfp(data)
                    loftee_args, loftee_fields = _get_loftee(data)
                    prediction_args = ["--sift", "b", "--polyphen", "b"]
                    prediction_fields = ["PolyPhen", "SIFT"]
                else:
                    dbnsfp_args, dbnsfp_fields = [], []
                    loftee_args, loftee_fields = [], []
                    prediction_args, prediction_fields = [], []
                std_fields = ["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL",
                              "Feature", "EXON"] + prediction_fields + \
                             ["Protein_position", "BIOTYPE", "CANONICAL", "CCDS"]
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name, "--no_stats", "--cache", "--offline",
                       "--dir", vep_dir, "--symbol", "--numbers", "--biotype",
                       "--total_length", "--canonical", "--ccds",
                       "--fields", ",".join(std_fields + dbnsfp_fields + loftee_fields)] + \
                      prediction_args + dbnsfp_args + loftee_args
                if tz.get_in(("config", "algorithm", "clinical_reporting"), data, False):
                    # In case of clinical reporting, we need one and only one
                    # variant per gene
                    # From the VEP docs:
                    # "Pick one line of consequence data per variant,
                    # including transcript-specific columns. Consequences are
                    # chosen by the canonical, biotype status and length of the
                    # transcript, along with the ranking of the consequence
                    # type according to this table. This is the best method to
                    # use if you are interested only in one consequence per
                    # variant."
                    cmd += ["--pick"]
                    # TODO investigate hgvs reporting but requires indexing the reference file
                    # cmd += ["--hgvs", "--shift_hgvs", "--fasta", dd.get_ref_file(data)]
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (perl_exports,
                                                                           " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file