def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    report_file = os.path.join(results_dir, "qualimapReport.html")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(report_file) and not utils.file_exists(os.path.join(results_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores)
        # Fixing the file name: MultiQC picks sample name from BAM file name.
        fixed_bam_fname = os.path.join(out_dir, dd.get_sample_name(data) + ".bam")
        if not os.path.islink(fixed_bam_fname):
            os.symlink(bam_file, fixed_bam_fname)
        export = utils.local_path_export()
        cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {fixed_bam_fname} -outdir {results_dir} "
               "--skip-duplicated --skip-dup-mode 0 "
               "-nt {num_cores} --java-mem-size={max_mem} {options}")
        species = None
        if tz.get_in(("genome_resources", "aliases", "human"), data, ""):
            species = "HUMAN"
        elif any(tz.get_in("genome_build", data, "").startswith(k) for k in ["mm", "GRCm"]):
            species = "MOUSE"
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_coverage(data), data) or dd.get_variant_regions_merged(data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data))
    # return _parse_qualimap_metrics(report_file, data)
    return dict()

def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    results_file = os.path.join(results_dir, "genome_results.txt")
    report_file = os.path.join(results_dir, "qualimapReport.html")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(results_file) and not utils.file_exists(os.path.join(results_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores)
        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)
            export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
                utils.java_freetype_fix(), utils.local_path_export(), max_mem, tx_results_dir)
            cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {tx_results_dir} "
                   "--skip-duplicated --skip-dup-mode 0 "
                   "-nt {num_cores} {options}")
            species = None
            if (tz.get_in(("genome_resources", "aliases", "human"), data, "")
                    or dd.get_genome_build(data).startswith(("hg", "GRCh"))):
                species = "HUMAN"
            elif dd.get_genome_build(data).startswith(("mm", "GRCm")):
                species = "MOUSE"
            if species in ["HUMAN", "MOUSE"]:
                cmd += " -gd {species}"
            regions = (dd.get_coverage(data) if dd.get_coverage(data) not in [None, False, "None"]
                       else dd.get_variant_regions_merged(data))
            if regions:
                regions = bedutils.merge_overlaps(bedutils.clean_file(regions, data), data)
                bed6_regions = _bed_to_bed6(regions, out_dir)
                cmd += " -gff {bed6_regions}"
            bcbio_env = utils.get_bcbio_env()
            do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data), env=bcbio_env)
            tx_results_file = os.path.join(tx_results_dir, "genome_results.txt")
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (dd.get_sample_name(data), tx_results_file)
            do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    # Qualimap output folder (results_dir) needs to be named after the sample (see comments above).
    # However, in order to keep its name after upload, we need to put the base QC file (results_file)
    # into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    return {"base": base_results_file,
            "secondary": _find_qualimap_secondary_files(results_dir, base_results_file)}

def starts_by_depth(bam_file, data, sample_size=10000000):
    """
    Return a set of x, y points where x is the number of reads sequenced and
    y is the number of unique start sites identified.
    If sample size < total reads in a file the file will be downsampled.
    """
    binsize = (sample_size / 100) + 1
    seen_starts = set()
    counted = 0
    num_reads = []
    starts = []
    buffer = []
    downsampled = bam.downsample(bam_file, data, sample_size)
    with bam.open_samfile(downsampled) as samfile:
        for read in samfile:
            if read.is_unmapped:
                continue
            counted += 1
            buffer.append(str(read.tid) + ":" + str(read.pos))
            if counted % binsize == 0:
                seen_starts.update(buffer)
                buffer = []
                num_reads.append(counted)
                starts.append(len(seen_starts))
    seen_starts.update(buffer)
    num_reads.append(counted)
    starts.append(len(seen_starts))
    return pd.DataFrame({"reads": num_reads, "starts": starts})

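# Usage sketch (hypothetical, not part of the original pipeline): plot the
# saturation curve returned by starts_by_depth to judge library complexity --
# the flatter the curve, the fewer new start sites each additional read adds.
# The BAM path and bcbio `data` dict come from the caller; matplotlib is an
# assumed extra dependency.
import matplotlib
matplotlib.use("Agg")  # render without a display, e.g. on a cluster node
import matplotlib.pyplot as plt

def plot_start_saturation(bam_file, data, out_png="start_saturation.png"):
    df = starts_by_depth(bam_file, data, sample_size=1000000)
    plt.figure()
    plt.plot(df["reads"], df["starts"])
    plt.xlabel("reads sampled")
    plt.ylabel("unique start sites")
    plt.title("Start-site saturation")
    plt.savefig(out_png)
    plt.close()
    return out_png
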
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    report_file = os.path.join(out_dir, "qualimapReport.html")
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(report_file) and not utils.file_exists(os.path.join(out_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        utils.safe_makedir(out_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores)
        export = utils.local_path_export()
        cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
               "-nt {num_cores} --java-mem-size={max_mem} {options}")
        species = tz.get_in(("genome_resources", "aliases", "ensembl"), data, "")
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data))
    return _parse_qualimap_metrics(report_file)

def _prep_subsampled_bams(data, work_dir):
    """Prepare a subsampled BAM file with discordants from samblaster and minimal correct pairs.

    This attempts to minimize run times by pre-extracting useful reads mixed
    with subsampled normal pairs to estimate paired end distributions:

    https://groups.google.com/d/msg/delly-users/xmia4lwOd1Q/uaajoBkahAIJ

    Subsamples correctly aligned reads to 100 million based on speedseq defaults and
    evaluations on NA12878 whole genome data:

    https://github.com/cc2qe/speedseq/blob/ca624ba9affb0bd0fb88834ca896e9122639ec94/bin/speedseq#L1102

    XXX Currently does not downsample as new versions do not get good sensitivity with
    downsampled BAMs.
    """
    full_bam, sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
    # Early return: the downsampling path below is intentionally disabled (see XXX note above)
    # and kept only for reference.
    return [full_bam]
    ds_bam = bam.downsample(full_bam, data, 1e8,
                            read_filter="-F 'not secondary_alignment and proper_pair'",
                            always_run=True, work_dir=work_dir)
    out_bam = "%s-final%s" % utils.splitext_plus(ds_bam)
    if not utils.file_exists(out_bam):
        bam.merge([ds_bam, sr_bam, disc_bam], out_bam, data["config"])
    bam.index(out_bam, data["config"])
    return [out_bam]

def _run_fastqc(bam_file, data, fastqc_out):
    """Run fastqc, generating report in specified directory and parsing metrics.

    Downsamples to 10 million reads to avoid excessive processing times with
    large files, unless we're running a Standard/QC pipeline.
    """
    sentry_file = os.path.join(fastqc_out, "fastqc_report.html")
    if not os.path.exists(sentry_file):
        work_dir = os.path.dirname(fastqc_out)
        utils.safe_makedir(work_dir)
        ds_bam = (bam.downsample(bam_file, data, 1e7)
                  if data.get("analysis", "").lower() not in ["standard"]
                  else None)
        bam_file = ds_bam if ds_bam else bam_file
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        with utils.curdir_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                cl = [config_utils.get_program("fastqc", data["config"]),
                      "-t", str(num_cores), "-o", tx_tmp_dir, "-f", "bam", bam_file]
                do.run(cl, "FastQC: %s" % data["name"][-1])
                fastqc_outdir = os.path.join(tx_tmp_dir,
                                             "%s_fastqc" % os.path.splitext(os.path.basename(bam_file))[0])
                if os.path.exists("%s.zip" % fastqc_outdir):
                    os.remove("%s.zip" % fastqc_outdir)
                if not os.path.exists(sentry_file):
                    if os.path.exists(fastqc_out):
                        shutil.rmtree(fastqc_out)
                    shutil.move(fastqc_outdir, fastqc_out)
        if ds_bam and os.path.exists(ds_bam):
            os.remove(ds_bam)
    parser = FastQCParser(fastqc_out)
    stats = parser.get_fastqc_summary()
    return stats

def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    report_file = os.path.join(out_dir, "qualimapReport.html")
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(report_file) and not utils.file_exists(os.path.join(out_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        utils.safe_makedir(out_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores)
        cmd = ("unset DISPLAY && {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
               "-nt {num_cores} --java-mem-size={max_mem} {options}")
        species = tz.get_in(("genome_resources", "aliases", "ensembl"), data, "")
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data))
    return _parse_qualimap_metrics(report_file)

def run_rnaseq(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results
    """
    strandedness = {"firststrand": "strand-specific-forward",
                    "secondstrand": "strand-specific-reverse",
                    "unstranded": "non-strand-specific",
                    "auto": "non-strand-specific"}
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    results_file = os.path.join(results_dir, "rnaseq_qc_results.txt")
    report_file = os.path.join(results_dir, "qualimapReport.html")
    config = data["config"]
    gtf_file = dd.get_transcriptome_gtf(data, default=dd.get_gtf_file(data))
    library = strandedness[dd.get_strandedness(data)]
    # don't run qualimap on the full bam by default
    if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
        logger.info(f"Full qualimap analysis for {bam_file} may be slow.")
        ds_bam = bam_file
    else:
        logger.info(f"Downsampling {bam_file} for Qualimap run.")
        ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
        bam_file = ds_bam if ds_bam else bam_file
    if not utils.file_exists(results_file):
        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)
            bam.index(bam_file, config)
            cmd = _rnaseq_qualimap_cmd(data, bam_file, tx_results_dir, gtf_file, library)
            do.run(cmd, "Qualimap for {}".format(dd.get_sample_name(data)))
            tx_results_file = os.path.join(tx_results_dir, "rnaseq_qc_results.txt")
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (dd.get_sample_name(data), tx_results_file)
            do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, results_dir, data))
    metrics.update(_detect_rRNA(data, results_dir))
    metrics.update({"Average_insert_size": salmon.estimate_fragment_size(data)})
    metrics = _parse_metrics(metrics)
    # Qualimap output folder (results_dir) needs to be named after the sample (see comments above).
    # However, in order to keep its name after upload, we need to put the base QC file (results_file)
    # into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    return {"base": base_results_file,
            "secondary": _find_qualimap_secondary_files(results_dir, base_results_file),
            "metrics": metrics}

def _prep_subsampled_bams(data, work_dir):
    """Prepare a subsampled BAM file with discordants from samblaster and minimal correct pairs.

    This attempts to minimize run times by pre-extracting useful reads mixed
    with subsampled normal pairs to estimate paired end distributions:

    https://groups.google.com/d/msg/delly-users/xmia4lwOd1Q/uaajoBkahAIJ

    Subsamples correctly aligned reads to 100 million based on speedseq defaults and
    evaluations on NA12878 whole genome data:

    https://github.com/cc2qe/speedseq/blob/ca624ba9affb0bd0fb88834ca896e9122639ec94/bin/speedseq#L1102

    XXX Currently not used as new versions of delly do not get good sensitivity with
    downsampled BAMs.
    """
    sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
    ds_bam = bam.downsample(dd.get_align_bam(data), data, 1e8,
                            read_filter="-F 'not secondary_alignment and proper_pair'",
                            always_run=True, work_dir=work_dir)
    out_bam = "%s-final%s" % utils.splitext_plus(ds_bam)
    if not utils.file_exists(out_bam):
        bam.merge([ds_bam, sr_bam, disc_bam], out_bam, data["config"])
    bam.index(out_bam, data["config"])
    return [out_bam]

def _run_qualimap(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    report_file = os.path.join(out_dir, "qualimapReport.html")
    if not os.path.exists(report_file):
        ds_bam = bam.downsample(bam_file, data, 1e7)
        bam_file = ds_bam if ds_bam else bam_file
        utils.safe_makedir(out_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        resources = config_utils.get_resources("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores)
        cmd = ("unset DISPLAY && {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
               "-nt {num_cores} --java-mem-size={max_mem}")
        species = tz.get_in(("genome_resources", "aliases", "ensembl"), data, "")
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()), "Qualimap: %s" % data["name"][-1])
    return _parse_qualimap_metrics(report_file)

def _run_qsignature_generator(bam_file, data, out_dir):
    """Run SignatureGenerator to create a normalized VCF that will later be input to qsignature_summary.

    :param bam_file: (str) path of the bam_file
    :param data: (dict) sample data dictionary
    :param out_dir: (str) path of the output

    :returns: (dict) dict with the normalized vcf file
    """
    position = dd.get_qsig_file(data)
    mixup_check = dd.get_mixup_check(data)
    if mixup_check and mixup_check.startswith("qsignature"):
        if not position:
            logger.info("There is no qsignature for this species: %s"
                        % tz.get_in(['genome_build'], data))
            return {}
        jvm_opts = "-Xms750m -Xmx2g"
        limit_reads = 20000000
        if mixup_check == "qsignature_full":
            slice_bam = bam_file
            jvm_opts = "-Xms750m -Xmx8g"
            limit_reads = 100000000
        else:
            slice_bam = _slice_chr22(bam_file, data)
        qsig = config_utils.get_program("qsignature", data["config"])
        if not qsig:
            return {}
        utils.safe_makedir(out_dir)
        out_name = os.path.basename(slice_bam).replace("bam", "qsig.vcf")
        out_file = os.path.join(out_dir, out_name)
        log_file = os.path.join(out_dir, "qsig.log")
        cores = dd.get_cores(data)
        base_cmd = ("{qsig} {jvm_opts} "
                    "org.qcmg.sig.SignatureGenerator "
                    "--noOfThreads {cores} "
                    "-log {log_file} -i {position} "
                    "-i {down_file} ")
        if not os.path.exists(out_file):
            down_file = bam.downsample(slice_bam, data, limit_reads)
            if not down_file:
                down_file = slice_bam
            file_qsign_out = "{0}.qsig.vcf".format(down_file)
            do.run(base_cmd.format(**locals()), "qsignature vcf generation: %s" % data["name"][-1])
            if os.path.exists(file_qsign_out):
                with file_transaction(data, out_file) as file_txt_out:
                    shutil.move(file_qsign_out, file_txt_out)
            else:
                raise IOError("File doesn't exist %s" % file_qsign_out)
        return {'qsig_vcf': out_file}
    return {}

def process_lane(item):
    """Prepare lanes, potentially splitting based on barcodes and reducing the
    number of reads for a test run
    """
    NUM_DOWNSAMPLE = 10000
    logger.debug("Preparing %s" % item["rgnames"]["lane"])
    file1, file2 = get_fastq_files(item)
    if item.get("test_run", False):
        if bam.is_bam(file1):
            file1 = bam.downsample(file1, item, NUM_DOWNSAMPLE)
        else:
            file1, file2 = fastq.downsample(file1, file2, item, NUM_DOWNSAMPLE, quick=True)
    item["files"] = (file1, file2)
    return [item]

def run(bam_file, data, fastqc_out):
    """Run fastqc, generating report in specified directory and parsing metrics.

    Downsamples to 10 million reads to avoid excessive processing times with
    large files, unless we're running a Standard/smallRNA-seq/QC pipeline.

    Handles fastqc 0.11+, which use a single HTML file and older versions that use
    a directory of files + images. The goal is to eventually move to only 0.11+
    """
    sentry_file = os.path.join(fastqc_out, "fastqc_report.html")
    if not os.path.exists(sentry_file):
        work_dir = os.path.dirname(fastqc_out)
        utils.safe_makedir(work_dir)
        ds_file = (bam.downsample(bam_file, data, 1e7, work_dir=work_dir)
                   if data.get("analysis", "").lower() not in ["standard", "smallrna-seq"]
                   else None)
        if ds_file is not None:
            bam_file = ds_file
        frmt = "bam" if bam_file.endswith("bam") else "fastq"
        fastqc_name = utils.splitext_plus(os.path.basename(bam_file))[0]
        fastqc_clean_name = dd.get_sample_name(data)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        with tx_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                cl = [config_utils.get_program("fastqc", data["config"]),
                      "-d", tx_tmp_dir, "-t", str(num_cores), "--extract",
                      "-o", tx_tmp_dir, "-f", frmt, bam_file]
                cl = "%s %s %s" % (utils.java_freetype_fix(), utils.local_path_export(),
                                   " ".join([str(x) for x in cl]))
                do.run(cl, "FastQC: %s" % dd.get_sample_name(data))
                tx_fastqc_out = os.path.join(tx_tmp_dir, "%s_fastqc" % fastqc_name)
                tx_combo_file = os.path.join(tx_tmp_dir, "%s_fastqc.html" % fastqc_name)
                if not os.path.exists(sentry_file) and os.path.exists(tx_combo_file):
                    utils.safe_makedir(fastqc_out)
                    # Use sample name for reports instead of bam file name
                    with open(os.path.join(tx_fastqc_out, "fastqc_data.txt"), 'r') as fastqc_bam_name, \
                            open(os.path.join(tx_fastqc_out, "_fastqc_data.txt"), 'w') as fastqc_sample_name:
                        for line in fastqc_bam_name:
                            fastqc_sample_name.write(line.replace(os.path.basename(bam_file), fastqc_clean_name))
                    shutil.move(os.path.join(tx_fastqc_out, "_fastqc_data.txt"),
                                os.path.join(fastqc_out, 'fastqc_data.txt'))
                    shutil.move(tx_combo_file, sentry_file)
                    if os.path.exists("%s.zip" % tx_fastqc_out):
                        shutil.move("%s.zip" % tx_fastqc_out,
                                    os.path.join(fastqc_out, "%s.zip" % fastqc_clean_name))
                elif not os.path.exists(sentry_file):
                    raise ValueError("FastQC failed to produce output HTML file: %s" % os.listdir(tx_tmp_dir))
    logger.info("Produced HTML report %s" % sentry_file)
    parser = FastQCParser(fastqc_out, dd.get_sample_name(data))
    stats = parser.get_fastqc_summary()
    parser.save_sections_into_file()
    return stats

def prepare_sample(data):
    """Prepare a sample to be run, potentially converting from BAM to FASTQ
    and/or downsampling the number of reads for a test run
    """
    NUM_DOWNSAMPLE = 10000
    logger.debug("Preparing %s" % data["rgnames"]["sample"])
    file1, file2 = get_fastq_files(data)
    if data.get("test_run", False):
        if bam.is_bam(file1):
            file1 = bam.downsample(file1, data, NUM_DOWNSAMPLE)
            file2 = None
        else:
            file1, file2 = fastq.downsample(file1, file2, data, NUM_DOWNSAMPLE, quick=True)
    data["files"] = [file1, file2]
    return [[data]]

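# Hypothetical invocation sketch for prepare_sample. The key names mirror the
# lookups in the function above ("rgnames"/"sample", "test_run", "files"); the
# sample name and file paths are placeholders, and the assumption that
# get_fastq_files reads data["files"] is not confirmed by the original code.
example_data = {
    "rgnames": {"sample": "test_sample"},
    "files": ["/path/to/test_sample_R1.fastq.gz", "/path/to/test_sample_R2.fastq.gz"],
    "test_run": True,  # triggers downsampling to NUM_DOWNSAMPLE (10,000) reads
}
# prepared = prepare_sample(example_data)  # returns [[data]] with downsampled "files"
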
def _run_fastqc(bam_file, data, fastqc_out):
    """Run fastqc, generating report in specified directory and parsing metrics.

    Downsamples to 10 million reads to avoid excessive processing times with
    large files, unless we're running a Standard/smallRNA-seq/QC pipeline.

    Handles fastqc 0.11+, which use a single HTML file and older versions that use
    a directory of files + images. The goal is to eventually move to only 0.11+
    """
    sentry_file = os.path.join(fastqc_out, "fastqc_report.html")
    if not os.path.exists(sentry_file):
        work_dir = os.path.dirname(fastqc_out)
        utils.safe_makedir(work_dir)
        ds_bam = (bam.downsample(bam_file, data, 1e7)
                  if data.get("analysis", "").lower() not in ["standard", "smallrna-seq"]
                  else None)
        bam_file = ds_bam if ds_bam else bam_file
        frmt = "bam" if bam_file.endswith("bam") else "fastq"
        fastqc_name = utils.splitext_plus(os.path.basename(bam_file))[0]
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        with tx_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                cl = [config_utils.get_program("fastqc", data["config"]),
                      "-t", str(num_cores), "--extract", "-o", tx_tmp_dir, "-f", frmt, bam_file]
                do.run(cl, "FastQC: %s" % data["name"][-1])
                tx_fastqc_out = os.path.join(tx_tmp_dir, "%s_fastqc" % fastqc_name)
                tx_combo_file = os.path.join(tx_tmp_dir, "%s_fastqc.html" % fastqc_name)
                if os.path.exists("%s.zip" % tx_fastqc_out):
                    os.remove("%s.zip" % tx_fastqc_out)
                if not os.path.exists(sentry_file) and os.path.exists(tx_combo_file):
                    utils.safe_makedir(fastqc_out)
                    shutil.move(os.path.join(tx_fastqc_out, "fastqc_data.txt"), fastqc_out)
                    shutil.move(tx_combo_file, sentry_file)
                elif not os.path.exists(sentry_file):
                    if os.path.exists(fastqc_out):
                        shutil.rmtree(fastqc_out)
                    shutil.move(tx_fastqc_out, fastqc_out)
    parser = FastQCParser(fastqc_out, data["name"][-1])
    stats = parser.get_fastqc_summary()
    parser.save_sections_into_file()
    return stats

def _run_fastqc(bam_file, data, fastqc_out):
    """Run fastqc, generating report in specified directory and parsing metrics.

    Downsamples to 10 million reads to avoid excessive processing times with
    large files, unless we're running a Standard/QC pipeline.

    Handles fastqc 0.11+, which use a single HTML file and older versions that use
    a directory of files + images. The goal is to eventually move to only 0.11+
    """
    sentry_file = os.path.join(fastqc_out, "fastqc_report.html")
    if not os.path.exists(sentry_file):
        work_dir = os.path.dirname(fastqc_out)
        utils.safe_makedir(work_dir)
        ds_bam = (bam.downsample(bam_file, data, 1e7)
                  if data.get("analysis", "").lower() not in ["standard"]
                  else None)
        bam_file = ds_bam if ds_bam else bam_file
        fastqc_name = os.path.splitext(os.path.basename(bam_file))[0]
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        with tx_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                cl = [config_utils.get_program("fastqc", data["config"]),
                      "-t", str(num_cores), "--extract", "-o", tx_tmp_dir, "-f", "bam", bam_file]
                do.run(cl, "FastQC: %s" % data["name"][-1])
                tx_fastqc_out = os.path.join(tx_tmp_dir, "%s_fastqc" % fastqc_name)
                tx_combo_file = os.path.join(tx_tmp_dir, "%s_fastqc.html" % fastqc_name)
                if os.path.exists("%s.zip" % tx_fastqc_out):
                    os.remove("%s.zip" % tx_fastqc_out)
                if not os.path.exists(sentry_file) and os.path.exists(tx_combo_file):
                    utils.safe_makedir(fastqc_out)
                    shutil.copy(os.path.join(tx_fastqc_out, "fastqc_data.txt"), fastqc_out)
                    shutil.move(tx_combo_file, sentry_file)
                elif not os.path.exists(sentry_file):
                    if os.path.exists(fastqc_out):
                        shutil.rmtree(fastqc_out)
                    shutil.move(tx_fastqc_out, fastqc_out)
        if ds_bam and os.path.exists(ds_bam):
            os.remove(ds_bam)
    parser = FastQCParser(fastqc_out)
    stats = parser.get_fastqc_summary()
    return stats

def process_lane(item):
    """Prepare lanes, potentially splitting based on barcodes and reducing the
    number of reads for a test run
    """
    NUM_DOWNSAMPLE = 10000
    logger.debug("Preparing %s" % item["rgnames"]["lane"])
    file1, file2 = get_fastq_files(item)
    if item.get("test_run", False):
        if bam.is_bam(file1):
            file1 = bam.downsample(file1, item, NUM_DOWNSAMPLE)
            file2 = None
        else:
            file1, file2 = fastq.downsample(file1, file2, item, NUM_DOWNSAMPLE, quick=True)
    item["files"] = [file1, file2]
    return [[item]]

def _run_qualimap(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    report_file = os.path.join(out_dir, "qualimapReport.html")
    if not os.path.exists(report_file):
        ds_bam = bam.downsample(bam_file, data, 1e7)
        bam_file = ds_bam if ds_bam else bam_file
        utils.safe_makedir(out_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        resources = config_utils.get_resources("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores)
        cmd = ("unset DISPLAY && {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
               "-nt {num_cores} --java-mem-size={max_mem}")
        species = data["genome_resources"]["aliases"].get("ensembl", "").upper()
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()), "Qualimap: %s" % data["name"][-1])
    return _parse_qualimap_metrics(report_file)