def rnaseq_vardict_variant_calling(data):
    """Run the VarDict RNA-seq calling pipeline for one sample.

    The result is written to ``<work_dir>/variation/<sample>-vardict.vcf.gz``.
    When that file is already present the shell pipeline is skipped. Either
    way the path is recorded on ``data`` through ``dd.set_vrn_file`` and the
    updated ``data`` is returned.
    """
    sample = dd.get_sample_name(data)
    variation_dir = os.path.join(dd.get_work_dir(data), "variation")
    safe_makedir(variation_dir)
    out_file = os.path.join(variation_dir, sample + "-vardict.vcf.gz")

    if not file_exists(out_file):
        # Everything that does not depend on the transactional output path is
        # resolved before the transaction is opened.
        fields = {
            "sample": sample,
            "vardict_cmd": vardict.get_vardict_command(data),
            "strandbias": "teststrandbias.R",
            "var2vcf": "var2vcf_valid.pl",
            "vcfstreamsort": config_utils.get_program("vcfstreamsort", data),
            "compress_cmd": "| bgzip -c",
            # Minimum allele fraction is configured as a percentage
            # (default 20); the command line takes a 0-1 fraction.
            "freq": float(dd.get_min_allele_fraction(data, 20) / 100.0),
            "var2vcf_opts": "-v 50",
            "fix_ambig": vcfutils.fix_ambiguous_cl(),
            "remove_dup": vcfutils.remove_dup_cl(),
            # Put the directory holding Rscript first on PATH for the
            # strand-bias R script.
            "r_setup": ("unset R_HOME && export PATH=%s:$PATH && "
                        % os.path.dirname(Rscript_cmd())),
            "ref_file": dd.get_ref_file(data),
            "bamfile": dd.get_work_bam(data),
            "bed_file": gtf.gtf_to_bed(dd.get_gtf_file(data)),
            "opts": " -c 1 -S 2 -E 3 -g 4 ",
        }
        # NOTE: there is intentionally no space between "{strandbias}" and the
        # following pipe; the template text is kept exactly as it was.
        template = (
            "{r_setup}{jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} "
            "-N {sample} -b {bamfile} {opts} {bed_file} "
            "| {strandbias}"
            "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
            "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} "
            "> {tx_out_file}"
        )
        with file_transaction(out_file) as tx_out_file:
            pipeline = template.format(
                jvm_opts=vardict._get_jvm_opts(data, tx_out_file),
                tx_out_file=tx_out_file,
                **fields)
            do.run(pipeline, "Calling RNA-seq variants with VarDict")

    return dd.set_vrn_file(data, out_file)
def ensure_annotations(resources, data):
    """Prepare any potentially missing annotations for downstream processing
    in a local directory.

    If ``resources["rnaseq"]["transcripts"]`` names an existing file, a BED
    conversion is requested from ``gtf.gtf_to_bed`` and stored under
    ``resources["rnaseq"]["gene_bed"]``. ``resources`` is modified in place
    and also returned; it is returned untouched when no transcript file is
    available.
    """
    transcript_gff = tz.get_in(["rnaseq", "transcripts"], resources)
    if transcript_gff and utils.file_exists(transcript_gff):
        # Make sure the local annotation directory exists before handing it to
        # gtf_to_bed; previously the path was passed without being created.
        out_dir = utils.safe_makedir(
            os.path.join(tz.get_in(["dirs", "work"], data),
                         "inputs", "data", "annotations"))
        resources["rnaseq"]["gene_bed"] = gtf.gtf_to_bed(transcript_gff, out_dir)
    return resources
def _ensure_annotations(resources, data):
    """Fill in the gene BED annotation derived from the transcript file.

    Looks up ``resources["rnaseq"]["transcripts"]``; when it points at an
    existing file, creates ``<work>/inputs/data/annotations`` and stores the
    result of ``gtf.gtf_to_bed`` under ``resources["rnaseq"]["gene_bed"]``.
    ``resources`` is updated in place and returned in every case.
    """
    gff_path = tz.get_in(["rnaseq", "transcripts"], resources)
    # Nothing to derive without a usable transcript file.
    if not gff_path or not utils.file_exists(gff_path):
        return resources

    work_dir = tz.get_in(["dirs", "work"], data)
    annotation_dir = utils.safe_makedir(
        os.path.join(work_dir, "inputs", "data", "annotations"))
    resources["rnaseq"]["gene_bed"] = gtf.gtf_to_bed(gff_path, annotation_dir)
    return resources
def _setup_variant_regions(data):
    """Point ``data`` at a cleaned BED file of variant regions for calling.

    Uses the configured variant regions, falling back to a BED generated from
    the GTF file when none are set. The cleaned copy, written under
    ``<work_dir>/bedprep``, keeps only regions whose chromosome is a contig of
    the reference file and passes ``chromhacks.is_nonalt``. The file is only
    rebuilt when ``utils.file_uptodate`` reports it stale relative to the
    input regions.
    """
    regions_bed = dd.get_variant_regions(data)
    if not regions_bed:
        regions_bed = gtf.gtf_to_bed(dd.get_gtf_file(data))

    ref_contigs = {contig.name for contig in ref.file_contigs(dd.get_ref_file(data))}

    bedprep_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bedprep"))
    base_name = utils.splitext_plus(os.path.basename(regions_bed))[0]
    out_file = os.path.join(bedprep_dir, "%s-rnaseq_clean.bed" % base_name)

    if not utils.file_uptodate(out_file, regions_bed):
        with file_transaction(data, out_file) as tx_out_file, \
                open(tx_out_file, "w") as out_handle, \
                shared.bedtools_tmpdir(data):
            for region in pybedtools.BedTool(regions_bed):
                # Drop regions on contigs missing from the reference.
                if region.chrom not in ref_contigs:
                    continue
                # Drop regions on contigs that is_nonalt rejects.
                if not chromhacks.is_nonalt(region.chrom):
                    continue
                out_handle.write(str(region))

    return dd.set_variant_regions(data, out_file)