def _rnaseq_qualimap_cmd(data, bam_file, out_dir, gtf_file=None, library="non-strand-specific"): """ Create command lines for qualimap """ config = data["config"] qualimap = config_utils.get_program("qualimap", config) resources = config_utils.get_resources("qualimap", config) num_cores = resources.get("cores", dd.get_num_cores(data)) max_mem = config_utils.adjust_memory(resources.get("memory", "2G"), num_cores) export = "%s%s" % (utils.java_freetype_fix(), utils.local_path_export()) export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % ( utils.java_freetype_fix(), utils.local_path_export(), max_mem, out_dir) if library != "non-strand-specific": logger.info( "Qualimap can get the orientation wrong for stranded reads, so we run it in unstranded mode. This gives comparable results to unstranded for RNA-seq data (see https://groups.google.com/forum/#!topic/qualimap/ZGo-k8LGmHQ) for a further explanation." ) library = "non-strand-specific" paired = " --paired" if bam.is_paired(bam_file) else "" cmd = ("unset DISPLAY && {export} {qualimap} rnaseq -outdir {out_dir} " "-a proportional -bam {bam_file} -p {library}{paired} " "-gtf {gtf_file}").format(**locals()) return cmd
def run(bam_file, data, out_dir): """Run qualimap to assess alignment quality metrics. """ # Qualimap results should be saved to a directory named after sample. # MultiQC (for parsing additional data) picks the sample name after the dir as follows: # <sample name>/raw_data_qualimapReport/insert_size_histogram.txt results_dir = os.path.join(out_dir, dd.get_sample_name(data)) resources = config_utils.get_resources("qualimap", data["config"]) options = " ".join(resources.get("options", "")) results_file = os.path.join(results_dir, "genome_results.txt") report_file = os.path.join(results_dir, "qualimapReport.html") utils.safe_makedir(results_dir) pdf_file = "qualimapReport.pdf" if not utils.file_exists(results_file) and not utils.file_exists(os.path.join(results_dir, pdf_file)): if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []): logger.info("Full qualimap analysis for %s may be slow." % bam_file) ds_bam = bam_file else: ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir) bam_file = ds_bam if ds_bam else bam_file if options.find("PDF") > -1: options = "%s -outfile %s" % (options, pdf_file) num_cores = data["config"]["algorithm"].get("num_cores", 1) qualimap = config_utils.get_program("qualimap", data["config"]) max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores) with file_transaction(data, results_dir) as tx_results_dir: utils.safe_makedir(tx_results_dir) export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % ( utils.java_freetype_fix(), utils.local_path_export(), max_mem, tx_results_dir) cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {tx_results_dir} " "--skip-duplicated --skip-dup-mode 0 " "-nt {num_cores} {options}") species = None if (tz.get_in(("genome_resources", "aliases", "human"), data, "") or dd.get_genome_build(data).startswith(("hg", "GRCh"))): species = "HUMAN" elif dd.get_genome_build(data).startswith(("mm", "GRCm")): species = "MOUSE" if species in ["HUMAN", "MOUSE"]: cmd += " -gd {species}" regions = (dd.get_coverage(data) if dd.get_coverage(data) not in [None, False, "None"] else dd.get_variant_regions_merged(data)) if regions: regions = bedutils.merge_overlaps(bedutils.clean_file(regions, data), data) bed6_regions = _bed_to_bed6(regions, out_dir) cmd += " -gff {bed6_regions}" bcbio_env = utils.get_bcbio_env() do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data), env=bcbio_env) tx_results_file = os.path.join(tx_results_dir, "genome_results.txt") cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (dd.get_sample_name(data), tx_results_file) do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data))) # Qualimap output folder (results_dir) needs to be named after the sample (see comments above). However, in order # to keep its name after upload, we need to put the base QC file (results_file) into the root directory (out_dir): base_results_file = os.path.join(out_dir, os.path.basename(results_file)) shutil.copyfile(results_file, base_results_file) return {"base": base_results_file, "secondary": _find_qualimap_secondary_files(results_dir, base_results_file)}
def _rnaseq_qualimap_cmd(data, bam_file, out_dir, gtf_file=None, library="non-strand-specific"): """ Create command lines for qualimap """ config = data["config"] qualimap = config_utils.get_program("qualimap", config) resources = config_utils.get_resources("qualimap", config) num_cores = resources.get("cores", dd.get_num_cores(data)) max_mem = config_utils.adjust_memory(resources.get("memory", "2G"), num_cores) export = "%s%s" % (utils.java_freetype_fix(), utils.local_path_export()) export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % ( utils.java_freetype_fix(), utils.local_path_export(), max_mem, out_dir) paired = " --paired" if bam.is_paired(bam_file) else "" cmd = ("unset DISPLAY && {export} {qualimap} rnaseq -outdir {out_dir} " "-a proportional -bam {bam_file} -p {library}{paired} " "-gtf {gtf_file}").format(**locals()) return cmd
def run(bam_file, data, fastqc_out): """Run fastqc, generating report in specified directory and parsing metrics. Downsamples to 10 million reads to avoid excessive processing times with large files, unless we're running a Standard/smallRNA-seq/QC pipeline. Handles fastqc 0.11+, which use a single HTML file and older versions that use a directory of files + images. The goal is to eventually move to only 0.11+ """ sentry_file = os.path.join(fastqc_out, "fastqc_report.html") if not os.path.exists(sentry_file): work_dir = os.path.dirname(fastqc_out) utils.safe_makedir(work_dir) ds_file = (bam.downsample(bam_file, data, 1e7, work_dir=work_dir) if data.get("analysis", "").lower() not in ["standard", "smallrna-seq"] else None) if ds_file is not None: bam_file = ds_file frmt = "bam" if bam_file.endswith("bam") else "fastq" fastqc_name = utils.splitext_plus(os.path.basename(bam_file))[0] fastqc_clean_name = dd.get_sample_name(data) num_cores = data["config"]["algorithm"].get("num_cores", 1) with tx_tmpdir(data, work_dir) as tx_tmp_dir: with utils.chdir(tx_tmp_dir): cl = [config_utils.get_program("fastqc", data["config"]), "-d", tx_tmp_dir, "-t", str(num_cores), "--extract", "-o", tx_tmp_dir, "-f", frmt, bam_file] cl = "%s %s %s" % (utils.java_freetype_fix(), utils.local_path_export(), " ".join([str(x) for x in cl])) do.run(cl, "FastQC: %s" % dd.get_sample_name(data)) tx_fastqc_out = os.path.join(tx_tmp_dir, "%s_fastqc" % fastqc_name) tx_combo_file = os.path.join(tx_tmp_dir, "%s_fastqc.html" % fastqc_name) if not os.path.exists(sentry_file) and os.path.exists(tx_combo_file): utils.safe_makedir(fastqc_out) # Use sample name for reports instead of bam file name with open(os.path.join(tx_fastqc_out, "fastqc_data.txt"), 'r') as fastqc_bam_name, \ open(os.path.join(tx_fastqc_out, "_fastqc_data.txt"), 'w') as fastqc_sample_name: for line in fastqc_bam_name: fastqc_sample_name.write(line.replace(os.path.basename(bam_file), fastqc_clean_name)) shutil.move(os.path.join(tx_fastqc_out, "_fastqc_data.txt"), os.path.join(fastqc_out, 'fastqc_data.txt')) shutil.move(tx_combo_file, sentry_file) if os.path.exists("%s.zip" % tx_fastqc_out): shutil.move("%s.zip" % tx_fastqc_out, os.path.join(fastqc_out, "%s.zip" % fastqc_clean_name)) elif not os.path.exists(sentry_file): raise ValueError("FastQC failed to produce output HTML file: %s" % os.listdir(tx_tmp_dir)) logger.info("Produced HTML report %s" % sentry_file) parser = FastQCParser(fastqc_out, dd.get_sample_name(data)) stats = parser.get_fastqc_summary() parser.save_sections_into_file() return stats
def _rnaseq_qualimap_cmd(data, bam_file, out_dir, gtf_file=None, single_end=None, library="non-strand-specific"): """ Create command lines for qualimap """ config = data["config"] qualimap = config_utils.get_program("qualimap", config) resources = config_utils.get_resources("qualimap", config) num_cores = resources.get("cores", dd.get_num_cores(data)) max_mem = config_utils.adjust_memory(resources.get("memory", "2G"), num_cores) export = "%s%s" % (utils.java_freetype_fix(), utils.local_path_export()) cmd = ("unset DISPLAY && {export} {qualimap} rnaseq -outdir {out_dir} " "-a proportional -bam {bam_file} -p {library} " "-gtf {gtf_file} --java-mem-size={max_mem}").format(**locals()) return cmd