def make_bcbiornaseq_object(data):
    """
    build the gene-level and transcript-level bcbioRNASeq objects

    data is handed back untouched when "bcbiornaseq" is not among the tools
    switched on for it. Otherwise, for each level in turn, an R loading script
    is written into <upload dir>/bcbioRNASeq, run with Rscript from inside
    that directory, and write_counts is called on the bcb.rda expected in the
    level's data directory. The quality report is generated last and data is
    returned.
    """
    if "bcbiornaseq" not in dd.get_tools_on(data):
        return data
    upload_dir = tz.get_in(("upload", "dir"), data)
    report_dir = os.path.join(upload_dir, "bcbioRNASeq")
    safe_makedir(report_dir)
    organism = dd.get_bcbiornaseq(data).get("organism", None)
    groups = dd.get_bcbiornaseq(data).get("interesting_groups", None)
    # (level, R script name, run message, sub-directory holding bcb.rda)
    passes = (
        ("gene", "load_bcbioRNAseq.R",
         "Loading bcbioRNASeq object.", "data"),
        ("transcript", "load_transcript_bcbioRNAseq.R",
         "Loading transcript-level bcbioRNASeq object.", "data-transcript"),
    )
    for level, script_name, message, rda_dir in passes:
        load_code = create_load_string(upload_dir, groups, organism, level)
        script_path = os.path.join(report_dir, script_name)
        with file_transaction(script_path) as tx_script:
            memoize_write_file(load_code, tx_script)
        rscript = Rscript_cmd()
        with chdir(report_dir):
            do.run([rscript, "--vanilla", script_path], message)
            write_counts(os.path.join(report_dir, rda_dir, "bcb.rda"), level)
    make_quality_report(data)
    return data
def rnaseq_vardict_variant_calling(data):
    """
    call variants on a sample's RNA-seq alignments with VarDict

    The bgzip-compressed VCF goes to
    <work dir>/variation/<sample>-vardict.vcf.gz and is registered on data
    through dd.set_vrn_file. If that file already exists the calling pipeline
    is skipped and only the registration happens.

    Returns the updated data.
    """
    sample = dd.get_sample_name(data)
    variation_dir = os.path.join(dd.get_work_dir(data), "variation")
    safe_makedir(variation_dir)
    out_file = os.path.join(variation_dir, sample + "-vardict.vcf.gz")
    if not file_exists(out_file):
        # single shell pipeline: VarDict -> strand bias test -> VCF conversion
        # -> ambiguous base fix -> duplicate removal -> sort -> bgzip
        pipeline = ("{r_setup}{jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} "
                    "-N {sample} -b {bamfile} {opts} {bed_file} "
                    "| {strandbias}"
                    "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                    "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} "
                    "> {tx_out_file}")
        # explicit template fields instead of locals(); the values are
        # evaluated in the order they are listed
        fields = {
            "sample": sample,
            "vardict_cmd": vardict.get_vardict_command(data),
            "strandbias": "teststrandbias.R",
            "var2vcf": "var2vcf_valid.pl",
            "vcfstreamsort": config_utils.get_program("vcfstreamsort", data),
            "compress_cmd": "| bgzip -c",
            "freq": float(dd.get_min_allele_fraction(data, 20) / 100.0),
            "var2vcf_opts": "-v 50",
            "fix_ambig": vcfutils.fix_ambiguous_cl(),
            "remove_dup": vcfutils.remove_dup_cl(),
            # put the directory holding Rscript first on PATH for the R step
            "r_setup": ("unset R_HOME && export PATH=%s:$PATH && "
                        % os.path.dirname(Rscript_cmd())),
            "ref_file": dd.get_ref_file(data),
            "bamfile": dd.get_work_bam(data),
            "bed_file": gtf.gtf_to_bed(dd.get_gtf_file(data)),
            "opts": " -c 1 -S 2 -E 3 -g 4 ",
        }
        with file_transaction(out_file) as tx_out_file:
            fields["jvm_opts"] = vardict._get_jvm_opts(data, tx_out_file)
            fields["tx_out_file"] = tx_out_file
            do.run(pipeline.format(**fields),
                   "Calling RNA-seq variants with VarDict")
    return dd.set_vrn_file(data, out_file)
def write_counts(bcb, level="gene"):
    """
    export counts and sample metadata from a saved bcbioRNASeq object

    bcb is the path to the .rda file and level names the sub-directory used
    for the output. Files are written by R into
    <dir of bcb>/../results/<today>/<level>/counts as counts.csv.gz (rounded
    counts) and metadata.csv.gz. The output directory is always created; the
    R step is skipped when counts.csv.gz is already there.

    Returns the path of counts.csv.gz.
    """
    today = dt.now().strftime("%Y-%m-%d")
    out_dir = os.path.join(os.path.dirname(bcb), "..", "results", today,
                           level, "counts")
    quoted_out_dir = _quotestring(out_dir)
    out_file = os.path.join(out_dir, "counts.csv.gz")
    safe_makedir(out_dir)
    if not file_exists(out_file):
        quoted_bcb = _quotestring(bcb)
        rscript = Rscript_cmd()
        # R statements, concatenated into a single -e expression
        r_statements = (
            f'load({quoted_bcb});',
            'date=format(Sys.time(), "%Y-%m-%d");',
            f'dir={quoted_out_dir};',
            'library(tidyverse);',
            'library(bcbioRNASeq);',
            'counts = bcbioRNASeq::counts(bcb) %>% as.data.frame() %>% round() %>% tibble::rownames_to_column("gene");',
            'metadata = colData(bcb) %>% as.data.frame() %>% tibble::rownames_to_column("sample");',
            'readr::write_csv(counts, file.path(dir, "counts.csv.gz"));',
            'readr::write_csv(metadata, file.path(dir, "metadata.csv.gz"));',
        )
        do.run([rscript, "--vanilla", "-e", "".join(r_statements)],
               f"Writing counts table to {out_file}.")
    return out_file
def load_tximport(data):
    """
    summarize salmon quantifications with tximport

    Every quant.sf found under <work dir>/salmon is passed to
    tximport::tximport together with the tx2gene.csv mapping from
    <work dir>/inputs/transcriptome, and two CSV files are written to
    <work dir>/salmon/combined: rounded counts and abundances. The R step is
    skipped when both files already exist.

    Returns a dict with the keys "gene_tpm" and "gene_counts" holding the two
    file paths.
    """
    rscript = Rscript_cmd()
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon")
    tx2gene_file = os.path.join(work_dir, "inputs", "transcriptome",
                                "tx2gene.csv")
    out_dir = os.path.join(salmon_dir, "combined")
    safe_makedir(out_dir)
    tpm_file = os.path.join(out_dir, "tximport-tpm.csv")
    counts_file = os.path.join(out_dir, "tximport-counts.csv")
    outputs = {"gene_tpm": tpm_file, "gene_counts": counts_file}
    if all(file_exists(path) for path in (tpm_file, counts_file)):
        return outputs
    with file_transaction(tpm_file) as tx_tpm_file, \
            file_transaction(counts_file) as tx_counts_file:
        # R statements, concatenated into a single -e expression
        r_statements = (
            'library(tidyverse);',
            f'salmon_files = list.files("{salmon_dir}", pattern="quant.sf", recursive=TRUE, full.names=TRUE);',
            f'tx2gene = readr::read_csv("{tx2gene_file}", col_names=c("transcript", "gene")); ',
            # each sample is named after the directory holding its quant.sf
            'samples = basename(dirname(salmon_files));',
            'names(salmon_files) = samples;',
            'txi = tximport::tximport(salmon_files, type="salmon", tx2gene=tx2gene, countsFromAbundance="lengthScaledTPM", dropInfReps=TRUE);',
            f'readr::write_csv(round(txi$counts) %>% as.data.frame() %>% tibble::rownames_to_column("gene"), "{tx_counts_file}");',
            f'readr::write_csv(txi$abundance %>% as.data.frame() %>% tibble::rownames_to_column("gene"), "{tx_tpm_file}");',
        )
        do.run([rscript, "--vanilla", "-e", "".join(r_statements)],
               "Loading tximport.")
    return outputs
def make_scrnaseq_object(samples):
    """
    load the initial se.rda object using SingleCellExperiment

    As written, this only prepares the R script: when se.rda is missing from
    the directory holding the combined counts, the rRNA gene list is produced
    with _find_rRNA_genes and the module-level _script template is written to
    <counts dir>/se-run.R. Running that script is currently commented out.

    samples is the collection queried through dd.get_in_samples. The
    transcriptome GTF is used when set, otherwise the regular GTF.

    Nothing is returned.
    """
    # NOTE(review): local names in this function must not be renamed or
    # removed -- the R template is filled from locals(), so names that look
    # unused here (e.g. local_sitelib) are presumably referenced by _script;
    # verify against the template before touching them.
    local_sitelib = R_sitelib()
    counts_dir = os.path.dirname(
        dd.get_in_samples(samples, dd.get_combined_counts))
    gtf_file = dd.get_in_samples(samples, dd.get_transcriptome_gtf)
    if not gtf_file:
        gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    rda_file = os.path.join(counts_dir, "se.rda")
    if not file_exists(rda_file):
        with file_transaction(rda_file) as tx_out_file:
            rcode = "%s-run.R" % os.path.splitext(rda_file)[0]
            rrna_file = "%s-rrna.txt" % os.path.splitext(rda_file)[0]
            rrna_file = _find_rRNA_genes(gtf_file, rrna_file)
            with open(rcode, "w") as out_handle:
                out_handle.write(_script.format(**locals()))
            rscript = Rscript_cmd()
            try:
                # Execution of the generated script is disabled; kept for
                # reference.
                # do.run([rscript, "--vanilla", rcode],
                #        "SingleCellExperiment",
                #        log_error=False)
                rda_file = rcode
            except subprocess.CalledProcessError:
                # Logger.exception() requires a message; the bare call used
                # before would itself raise TypeError inside this handler.
                logger.exception("SingleCellExperiment R script failed: %s",
                                 rcode)
def load_summarizedexperiment(samples):
    """ create the SummarizedExperiment rds object for a set of samples

    The work directory is taken from the first sample. bcbio2se.R is run to
    produce <work dir>/salmon/bcbio-se.rds unless it already exists; a
    failure there is logged, not raised. When the rds file exists afterwards
    the QC report is attempted (failure again only logged) and every sample
    gets the rds path set on it.

    Returns the updated samples, each wrapped in its own list, or the
    original samples unchanged when no rds file is available.

    Known limitation noted by the original author: fails with n_samples = 1
    """
    # using r36 (4.0) - will eventually drop R3.5
    rscript = Rscript_cmd("r36")
    se_script = os.path.join(os.path.dirname(__file__), os.pardir,
                             "scripts", "R", "bcbio2se.R")
    work_dir = dd.get_work_dir(samples[0][0])
    summarized_experiment = os.path.join(work_dir, "salmon", "bcbio-se.rds")
    if not file_exists(summarized_experiment):
        with file_transaction(summarized_experiment) as tx_out_file:
            run_cmd = f"{rscript} --vanilla {se_script} {work_dir} {tx_out_file}"
            try:
                do.run(run_cmd, "Loading SummarizedExperiment.")
            except Exception:
                # best effort: the pipeline continues without the SE object
                logger.error("SE creation failed")
    if not file_exists(summarized_experiment):
        return samples
    try:
        generate_se_qc_report(work_dir)
    except Exception:
        # the QC report is optional; samples are still updated below
        logger.error("SE QC failed")
    return [[dd.set_summarized_experiment(sample, summarized_experiment)]
            for sample in dd.sample_data_iterator(samples)]
def _sleuthify_sailfish(sailfish_dir):
    """
    convert Sailfish output for sleuth with the wasabi R package

    Returns None when R_package_path finds no wasabi; otherwise runs
    prepare_fish_for_sleuth on sailfish_dir and returns the path of
    abundance.h5 inside that directory.
    """
    if not R_package_path("wasabi"):
        return None
    template = """{rscript} -e 'library("wasabi"); prepare_fish_for_sleuth(c("{sailfish_dir}"))'"""
    convert_cmd = template.format(rscript=Rscript_cmd(),
                                  sailfish_dir=sailfish_dir)
    do.run(convert_cmd, "Converting Sailfish to Sleuth format.")
    return os.path.join(sailfish_dir, "abundance.h5")
def render_rmarkdown_file(filename):
    """
    render an rmarkdown file with rmarkdown::render

    Rscript is started with --no-environ from inside the directory that holds
    filename. Returns filename.
    """
    r_call = Template('rmarkdown::render("$filename")').substitute(
        filename=filename)
    rscript = Rscript_cmd()
    with chdir(os.path.dirname(filename)):
        do.run([rscript, "--no-environ", "-e", r_call],
               "Rendering bcbioRNASeq quality control report.")
    return filename
def load_summarizedexperiment(data):
    """
    create the SummarizedExperiment rds file for a run

    Runs scripts/R/bcbio2se.R (resolved relative to this module) with the
    work directory and the output path as arguments, unless
    <work dir>/salmon/bcbio-se.rds already exists.

    Returns the path of the rds file.
    """
    rscript = Rscript_cmd()
    se_script = os.path.join(os.path.dirname(__file__), os.pardir,
                             "scripts", "R", "bcbio2se.R")
    work_dir = dd.get_work_dir(data)
    out_file = os.path.join(work_dir, "salmon", "bcbio-se.rds")
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            run_cmd = f"{rscript} --vanilla {se_script} {work_dir} {tx_out_file}"
            do.run(run_cmd, "Loading SummarizedExperiment.")
    return out_file
def make_bcbiornaseq_object(data):
    """
    load the initial bcbioRNASeq object

    When "bcbiornaseq" is among the tools switched on for data, an R loading
    script is written to <upload dir>/bcbioRNASeq/load_bcbioRNAseq.R and run
    with Rscript from inside that directory. data is returned unchanged in
    either case.
    """
    if "bcbiornaseq" in dd.get_tools_on(data):
        upload_dir = tz.get_in(("upload", "dir"), data)
        report_dir = os.path.join(upload_dir, "bcbioRNASeq")
        safe_makedir(report_dir)
        organism = dd.get_bcbiornaseq(data).get("organism", None)
        groups = dd.get_bcbiornaseq(data).get("interesting_groups", None)
        load_code = create_load_string(upload_dir, groups, organism)
        script_path = os.path.join(report_dir, "load_bcbioRNAseq.R")
        with file_transaction(script_path) as tx_script:
            write_load_bcbiornaseq_file(load_code, tx_script)
        rscript = Rscript_cmd()
        with chdir(report_dir):
            do.run([rscript, script_path], "Loading bcbioRNASeq object.")
    return data
def generate_se_qc_report(work_dir):
    """ render the QC report for the SummarizedExperiment RDS object

    scripts/R/se2qc.Rmd (resolved relative to this module) is rendered with
    rmarkdown, receiving <work_dir>/salmon/bcbio-se.rds as its rds_file
    parameter. The report goes to <work_dir>/qc/bcbio-se.html; rendering is
    skipped when that file already exists.

    Returns the path of the HTML report.
    """
    rscript = Rscript_cmd("r36")
    rmd_path = os.path.join(os.path.dirname(__file__), os.pardir,
                            "scripts", "R", "se2qc.Rmd")
    out_file = os.path.join(work_dir, "qc", "bcbio-se.html")
    rds_file = os.path.join(work_dir, "salmon", "bcbio-se.rds")
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            render_cmd = (
                f"""{rscript} --vanilla """
                f"""-e 'rmarkdown::render("{rmd_path}", params = list(rds_file="{rds_file}"), output_file="{tx_out_file}")'"""
            )
            do.run(render_cmd, "Creating SE QC report")
    return out_file
def rmarkdown_draft(filename, template, package):
    """
    create a draft rmarkdown file from an installed template

    filename is the path of the draft to create, template the name of the
    template and package the R package providing it; all three are passed to
    rmarkdown::draft. An existing filename is left alone. After drafting,
    every "YYYY-MM-DD/" in the file is removed in place with sed.

    Returns filename.
    """
    if file_exists(filename):
        return filename
    draft_template = Template(
        'rmarkdown::draft("$filename", template="$template", package="$package", edit=FALSE)'
    )
    draft_string = draft_template.substitute(
        filename=filename, template=template, package=package)
    report_dir = os.path.dirname(filename)
    rcmd = Rscript_cmd()
    with chdir(report_dir):
        do.run([rcmd, "--no-environ", "-e", draft_string],
               "Creating bcbioRNASeq quality control template.")
        # raw string: "\/" is not a valid Python escape and triggers a
        # SyntaxWarning on recent interpreters; sed receives the same text
        do.run(["sed", "-i", r"s/YYYY-MM-DD\///g", filename],
               "Editing bcbioRNAseq quality control template.")
    return filename