def make_general_reports(view, samples, target, genome, depth_threshs, bed_padding, num_pairs_by_sample=None,
                         reuse=False, is_debug=False, reannotate=False, fai_fpath=None):
    """Run QualiMap where needed and build one summary report per sample.

    QualiMap is skipped only when every expected output of every sample can be
    reused with respect to the sample BAM (and the QualiMap BED when the target
    has a BED). Otherwise it is scheduled for all samples through ``view.run``
    and every expected output is then verified critically.

    Returns the list of per-sample reports, in the order of ``samples``.
    The ``reuse`` argument is accepted but not read by this function.
    """
    def _outputs_are_reusable(s):
        # Compared against BAM + QualiMap BED for targeted runs, BAM alone otherwise.
        return all(
            can_reuse(out_fpath, [s.bam, target.qualimap_bed_fpath] if target.bed else s.bam)
            for out_fpath in _qualimap_outputs(s))

    if not all(_outputs_are_reusable(s) for s in samples):
        info('Running QualiMap...')
        job_args = [
            [s.work_dir, s.qualimap_dirpath, _qualimap_outputs(s), s.bam, genome,
             target.qualimap_bed_fpath, view.cores_per_job]
            for s in samples
        ]
        view.run(runner.run_qualimap, job_args)

        for s in samples:
            for out_fpath in _qualimap_outputs(s):
                verify_file(out_fpath, is_critical=True)
    else:
        debug('All QualiMap files for all samples exist and newer than BAMs and BEDs, reusing')

    summary_reports = []
    separator = '-'*70
    for sample in samples:
        info(separator)
        info(sample.name)
        debug(separator)
        debug('Parsing QualiMap results...')
        depth_stats, reads_stats, indels_stats, target_stats = parse_qualimap_results(sample)

        _prep_report_data(
            sample, depth_stats, reads_stats, indels_stats, target_stats,
            target, num_pairs_by_sample, genome, depth_threshs, fai_fpath=fai_fpath)

        summary_reports.append(_build_report(
            depth_stats, reads_stats, indels_stats, sample, target,
            depth_threshs, bed_padding, sample_num=len(samples),
            is_debug=is_debug, reannotate=reannotate))

    return summary_reports
def combined_regional_reports(work_dir, output_dir, samples):
    """Concatenate the per-sample regional TSV reports into one TSV in ``output_dir``.

    The header (first line) is taken from the first sample whose report
    verifies and gets a leading ``sample`` column; every other line is
    prefixed with the name of the sample it came from.

    Returns the path of the combined TSV. When no sample has a verifiable
    regional report, returns the tuple ``(None, None)``.
    """
    # NOTE(review): the early exit yields a 2-tuple while the normal path
    # yields a single path - confirm what callers expect before changing it.
    if not any(verify_file(s.targqc_region_tsv, silent=True) for s in samples):
        return None, None

    tsv_region_rep_fpath = join(output_dir, basename(samples[0].targqc_region_tsv))
    debug('Combining regional reports, writing to ' + tsv_region_rep_fpath)

    with file_transaction(work_dir, tsv_region_rep_fpath) as tx_tsv, open(tx_tsv, 'w') as tsv_out:
        header_pending = True  # true until the first verified sample has been copied
        for s in samples:
            if not (s.targqc_region_tsv and verify_file(s.targqc_region_tsv)):
                continue
            with open(s.targqc_region_tsv) as tsv_in:
                for line_no, line in enumerate(tsv_in):
                    if line_no > 0:
                        tsv_out.write(s.name + '\t' + line)
                    elif header_pending:
                        tsv_out.write('sample\t' + line)
            header_pending = False

    return tsv_region_rep_fpath
def _correct_qualimap_insert_size_histogram(samples): """ replacing Qualimap insert size histogram with Picard one. """ for s in samples: qualimap1_dirname = dirname(s.qualimap_ins_size_hist_fpath).replace('raw_data_qualimapReport', 'raw_data') qualimap2_dirname = dirname(s.qualimap_ins_size_hist_fpath) if exists(qualimap1_dirname): if not exists(qualimap2_dirname): shutil.move(qualimap1_dirname, qualimap2_dirname) else: shutil.rmtree(qualimap1_dirname) elif not exists(qualimap2_dirname): continue # no data from both Qualimap v.1 and Qualimap v.2 # if qualimap histogram exits and reuse_intermediate, skip if verify_file(s.qualimap_ins_size_hist_fpath, silent=True) and cfg.reuse_intermediate: pass else: if verify_file(s.picard_ins_size_hist_txt_fpath): with open(s.picard_ins_size_hist_txt_fpath, 'r') as picard_f: one_line_to_stop = False for line in picard_f: if one_line_to_stop: break if line.startswith('## HISTOGRAM'): one_line_to_stop = True with file_transaction(None, s.qualimap_ins_size_hist_fpath) as tx: with open(tx, 'w') as qualimap_f: for line in picard_f: qualimap_f.write(line)
def read_biomart(genome_name):
    """Load biomart records for ``genome_name`` into a dict keyed by transcript ID.

    For builds other than hg38, the hg38 biomart table is read as well:
    transcripts missing from the build's own table are added, and for the
    others the TSL value is overwritten with the hg38 one.

    Returns an empty dict when the biomart file of the requested genome
    cannot be verified.
    """
    id_col = 'Transcript ID'
    tsl_col = 'Transcript Support Level (TSL)'

    bm_fpath = ebl.biomart_fpath(genome_name)
    if not verify_file(bm_fpath):
        warn('Warning: biomart file for genome ' + genome_name + ' not found, skip using the TSL values')
        return dict()

    with open(bm_fpath) as f:
        features_by_ens_id = {rec[id_col]: rec for rec in csv.DictReader(f, delimiter='\t')}

    if genome_name.startswith('hg38'):
        return features_by_ens_id

    # hg38 version has TSL, checking if we can populate some TSL from it
    bm_fpath = ebl.biomart_fpath('hg38')
    if not verify_file(bm_fpath):
        critical('Biomart for hg38 file not found, and needed for TSL values')
    with open(bm_fpath) as f:
        for rec in csv.DictReader(f, delimiter='\t'):
            known = features_by_ens_id.get(rec[id_col])
            if known is None:
                features_by_ens_id[rec[id_col]] = rec
            else:
                known[tsl_col] = rec[tsl_col]
    return features_by_ens_id
def sort_bed_gsort(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, genome=None):
    """Sort a BED file with the external ``gsort`` tool, following a .fai order.

    The output goes to ``output_bed_fpath`` when given, otherwise to an
    intermediate ``sorted`` file derived from the input name in ``work_dir``.
    The .fai is taken from ``fai_fpath`` or looked up from ``genome``.

    Returns the path of the sorted BED (reused as is when ``can_reuse`` says so).
    """
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    if output_bed_fpath:
        output_bed_fpath = adjust_path(output_bed_fpath)
    else:
        output_bed_fpath = intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if fai_fpath:
        fai_fpath = verify_file(fai_fpath)
    elif genome:
        fai_fpath = verify_file(ref.get_fai(genome))
    else:
        critical('Either of fai_fpath or genome build name must be specified')

    gsort_cmd = 'gsort {input_bed_fpath} {fai_fpath}'.format(
        input_bed_fpath=input_bed_fpath, fai_fpath=fai_fpath)
    with file_transaction(work_dir, output_bed_fpath) as tx:
        run(gsort_cmd, output_fpath=tx)

    return output_bed_fpath
def make_tarqc_html_report(output_dir, work_dir, samples, bed_fpath=None, tag_by_sample=None):
    """Build the combined TargQC report (TSV + HTML) from per-sample JSON reports.

    Only samples whose ``targqc_json_fpath`` verifies are included. Each
    sample report optionally gets a project tag from ``tag_by_sample`` and a
    'Qualimap' record linking to the sample's Qualimap HTML, relative to
    ``output_dir``. With more than one sample, multi-sample Qualimap plots are
    generated as well.

    Returns ``(tsv_fpath, html_fpath)``, or ``(None, None)`` when no sample
    has a usable JSON report. ``bed_fpath`` is accepted but not read here.
    """
    jsons_by_sample = {s.name: s.targqc_json_fpath for s in samples if verify_file(s.targqc_json_fpath)}
    htmls_by_sample = dict()

    if not jsons_by_sample:
        # Same arity as the success path, so `tsv, html = ...` callers do not
        # crash (the previous three-element tuple could not be unpacked).
        return None, None

    targqc_full_report = FullReport.construct_from_sample_report_jsons(
        samples, output_dir, jsons_by_sample, htmls_by_sample)

    for sample_report in targqc_full_report.sample_reports:
        if tag_by_sample:
            sample_report.set_project_tag(tag_by_sample[sample_report.sample.name])
        if verify_file(sample_report.sample.qualimap_html_fpath):
            url = relpath(sample_report.sample.qualimap_html_fpath, output_dir)
            r = sample_report.find_record(sample_report.records, 'Qualimap')
            if r:
                r.url = url
            else:
                sample_report.add_record(metric_name='Qualimap', value='Qualimap', url=url, silent=True)

    if len(samples) > 1:
        run_multisample_qualimap(output_dir, work_dir, samples, targqc_full_report)

    # Output files are named after the first sample's text report.
    fn = splitext(basename(samples[0].targqc_txt_fpath))[0]
    tsv_fpath = targqc_full_report.save_tsv(join(output_dir, fn + '.tsv'))
    html_fpath = targqc_full_report.save_html(join(output_dir, fn + '.html'), 'TargQC')

    return tsv_fpath, html_fpath
def _correct_qualimap_insert_size_histogram(work_dir, samples): """ replacing Qualimap insert size histogram with Picard one. """ for s in samples: qualimap1_dirname = dirname(s.qualimap_ins_size_hist_fpath).replace( 'raw_data_qualimapReport', 'raw_data') qualimap2_dirname = dirname(s.qualimap_ins_size_hist_fpath) if exists(qualimap1_dirname): if not exists(qualimap2_dirname): shutil.move(qualimap1_dirname, qualimap2_dirname) else: shutil.rmtree(qualimap1_dirname) elif not exists(qualimap2_dirname): continue # no data from both Qualimap v.1 and Qualimap v.2 # if qualimap histogram exits and reuse_intermediate, skip if verify_file(s.qualimap_ins_size_hist_fpath, silent=True) and tc.reuse_intermediate: pass else: if verify_file(s.picard_ins_size_hist_txt_fpath): with open(s.picard_ins_size_hist_txt_fpath, 'r') as picard_f: one_line_to_stop = False for line in picard_f: if one_line_to_stop: break if line.startswith('## HISTOGRAM'): one_line_to_stop = True with file_transaction( work_dir, s.qualimap_ins_size_hist_fpath) as tx: with open(tx, 'w') as qualimap_f: for line in picard_f: qualimap_f.write(line)
def load_yaml_config(fpath):
    """Parse the YAML file at ``fpath`` and return the loaded object.

    The path is verified critically first. Any exception raised while opening
    or parsing is logged with its traceback and reported through ``critical``.
    """
    verify_file(fpath, is_critical=True)
    try:
        # Context manager so the handle is closed even when parsing fails
        # (the bare open() used previously was never closed).
        with open(fpath) as f:
            dic = load_yaml(f)
    except Exception:
        err(format_exc())
        critical('Could not parse bcbio YAML ' + fpath)
    else:
        return dic
def sort_bed(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, chr_order=None, genome=None):
    """Sort BED regions in Python, by the key each ``Region`` provides.

    The chromosome order comes from ``chr_order`` when given, otherwise it is
    built from ``fai_fpath`` or from the .fai of ``genome``. Chromosomes
    missing from the order get the order value -1. Blank lines are dropped;
    lines starting with '#' are copied to the output ahead of the regions.

    Returns the path of the sorted BED (reused as is when ``can_reuse`` says so).
    """
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    if output_bed_fpath:
        output_bed_fpath = adjust_path(output_bed_fpath)
    else:
        output_bed_fpath = intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if not chr_order:
        if fai_fpath:
            fai_fpath = verify_file(fai_fpath)
        elif genome:
            fai_fpath = verify_file(ref.get_fai(genome))
        else:
            critical('Either of chr_order, fai_fpath, or genome build name must be specified')
        chr_order = get_chrom_order(fai_fpath=fai_fpath)

    regions = []
    with open(input_bed_fpath) as bed_in:
        with file_transaction(work_dir, output_bed_fpath) as tx, open(tx, 'w') as out:
            for raw_line in bed_in:
                stripped = raw_line.strip()
                if not stripped:
                    continue
                if stripped.startswith('#'):
                    out.write(raw_line)
                    continue

                fields = stripped.split('\t')
                chrom, start, end = fields[0], int(fields[1]), int(fields[2])
                regions.append(Region(chrom, start, end, fields[3:], chr_order.get(chrom, -1)))

            for region in sorted(regions, key=lambda r: r.get_key()):
                columns = [region.chrom, str(region.start), str(region.end)] + list(region.other_fields)
                out.write('\t'.join(columns) + '\n')

    debug('Sorted ' + str(len(regions)) + ' regions, saved to ' + output_bed_fpath)
    return output_bed_fpath
def _retarget_output_path(cmd, output_fpath, tx_out_file):
    """Replace stand-alone mentions of ``output_fpath`` in ``cmd`` with ``tx_out_file``.

    A mention is the path preceded by a space and followed by a space or a
    newline, either bare or wrapped in matching double or single quotes. The
    quoting style of each mention is preserved, and all newlines are removed
    from the result.
    """
    for terminator in (' ', '\n'):
        for quote in ('', '"', "'"):
            cmd = cmd.replace(' ' + quote + output_fpath + quote + terminator,
                              ' ' + quote + tx_out_file + quote + terminator)
    return cmd.replace('\n', '')


def run(cmd, output_fpath=None, input_fpath=None, checks=None, stdout_to_outputfile=True,
        stdout_tx=True, reuse=False, env_vars=None):
    """Run the provided command, logging details and checking for errors.

    :param cmd: command to run; it is logged and passed on to ``_do_run``.
    :param output_fpath: expected output file. An existing file at that path
        is removed before the run.
    :param input_fpath: passed through to ``_do_run``.
    :param checks: passed through to ``_do_run``; defaults to
        ``[file_nonempty_check]``.
    :param stdout_to_outputfile: when running transactionally, redirect
        stdout to the transactional file; otherwise mentions of
        ``output_fpath`` inside ``cmd`` are pointed at the transactional file.
    :param stdout_tx: write the output through ``file_transaction``.
    :param reuse: skip the run when ``output_fpath`` (or ``output_fpath`` +
        '.gz') already verifies.
    :param env_vars: environment overrides; a value of ``None`` removes the
        variable from the child environment.
    :return: ``output_fpath`` when an existing output is reused, else ``None``.
    """
    if output_fpath and reuse:
        if verify_file(output_fpath, silent=True):
            info(output_fpath + ' exists, reusing')
            return output_fpath
        if not output_fpath.endswith('.gz') and verify_file(output_fpath + '.gz', silent=True):
            info(output_fpath + '.gz exists, reusing')
            return output_fpath

    env = os.environ.copy()
    if env_vars:
        for k, v in env_vars.items():
            if v is None:
                env.pop(k, None)
            else:
                env[k] = v

    if checks is None:
        checks = [file_nonempty_check]

    def _try_run(_cmd, _output_fpath, _input_fpath):
        # Log the command (joined when it is a sequence), then execute it.
        info(' '.join(str(x) for x in _cmd) if not isinstance(_cmd, six.string_types) else _cmd)
        _do_run(_cmd, checks, env, _output_fpath, _input_fpath)

    if output_fpath:
        if isfile(output_fpath):
            os.remove(output_fpath)

        if stdout_tx:
            with file_transaction(None, output_fpath) as tx_out_file:
                if stdout_to_outputfile:
                    cmd += ' > ' + tx_out_file
                else:
                    # The trailing newline lets a path at the very end of the
                    # command match. Quotes around the path are kept balanced;
                    # the previous substitution dropped the opening quote.
                    cmd = _retarget_output_path(cmd + '\n', output_fpath, tx_out_file)
                _try_run(cmd, tx_out_file, input_fpath)
        else:
            _try_run(cmd, output_fpath, input_fpath)
    else:
        _try_run(cmd, None, input_fpath)
def _make_padded_bed(self, work_dir, fai_fpath, padding): if self.is_wgs: return None self.padded_bed_fpath = intermediate_fname(work_dir, self.capture_bed_fpath, 'padded') if can_reuse(self.padded_bed_fpath, self.capture_bed_fpath): return BedTool(self.padded_bed_fpath) padded_bed = self.bed.slop(b=padding, g=fai_fpath).sort().merge() with file_transaction(work_dir, self.padded_bed_fpath) as tx: padded_bed.saveas(tx) verify_file(self.padded_bed_fpath, is_critical=True) return BedTool(self.padded_bed_fpath)
def _correct_qualimap_genome_results(samples): """ fixing java.lang.Double.parseDouble error on entries like "6,082.49" """ for s in samples: if verify_file(s.qualimap_genome_results_fpath): correction_is_needed = False with open(s.qualimap_genome_results_fpath, 'r') as f: content = f.readlines() metrics_started = False for line in content: if ">> Reference" in line: metrics_started = True if metrics_started: if line.find(',') != -1: correction_is_needed = True break if correction_is_needed: with open(s.qualimap_genome_results_fpath, 'w') as f: metrics_started = False for line in content: if ">> Reference" in line: metrics_started = True if metrics_started: if line.find(',') != -1: line = line.replace(',', '') f.write(line)
def get_chrom_lengths(genome=None, fai_fpath=None):
    """Return ``[(chrom, length), ...]`` in file order for a genome build or an index file.

    ``fai_fpath`` may be a .fai index or a .fa FASTA; without it the .fai of
    ``genome`` is looked up. A .fa file is parsed with Biopython to measure
    sequences; a .fai file is read as whitespace-separated columns
    (name, length, ...).
    """
    assert genome or fai_fpath, 'One of genome or fai_fpath should be not None: ' \
                                'genome=' + str(genome) + ' fai_fpath=' + str(fai_fpath)

    if fai_fpath:
        fai_fpath = verify_file(fai_fpath, is_critical=True)
        if not fai_fpath.endswith(('.fai', '.fa')):
            critical('Error: .fai or .fa is accepted.')
    else:
        check_genome(genome)
        fai_fpath = get_fai(genome)

    if fai_fpath.endswith('.fa'):
        debug('Reading genome sequence (.fa) to get chromosome lengths')
        with open(fai_fpath, 'r') as handle:
            from Bio import SeqIO
            return [(record.id, len(record.seq)) for record in SeqIO.parse(handle, 'fasta')]

    debug('Reading genome index file (.fai) to get chromosome lengths')
    chr_lengths = []
    with open(fai_fpath, 'r') as handle:
        for raw_line in handle:
            columns = raw_line.strip().split()
            if columns:
                chr_lengths.append((columns[0], int(columns[1])))
    return chr_lengths
def read_samples(args):
    """Split the input paths into BAMs and FastQ pairs, both keyed by sample.

    Returns ``(fastqs_by_sample, bam_by_sample)``. Reports a critical error
    when no valid input is found, or when the same sample key appears in both
    mappings.
    """
    bam_by_sample = find_bams(args)
    if bam_by_sample:
        plural = 's' if len(bam_by_sample) > 1 else ''
        info('Found ' + str(len(bam_by_sample)) + ' BAM file' + plural)

    # Inputs not recognised as BAMs; only those that verify are kept.
    verified = (verify_file(fpath) for fpath in args if adjust_path(fpath) not in bam_by_sample)
    input_not_bam = [fpath for fpath in verified if fpath]

    if not input_not_bam and not bam_by_sample:
        critical('No correct input files')

    fastqs_by_sample = dict()
    if input_not_bam:
        info('Input ' + str(len(input_not_bam)) + ' correct input non-BAM files')
        fastqs_by_sample = find_fastq_pairs(input_not_bam)
        if fastqs_by_sample:
            info('Found ' + str(len(fastqs_by_sample)) + ' FastQ pairs')

    intersection = set(fastqs_by_sample.keys()) & set(bam_by_sample.keys())
    if intersection:
        critical('The following samples both had input BAMs and FastQ: ' + ', '.join(intersection))

    return fastqs_by_sample, bam_by_sample
def make_tarqc_html_report(output_dir, work_dir, samples, bed_fpath=None, tag_by_sample=None):
    """Build the combined TargQC report (TSV + HTML) from per-sample JSON reports.

    Only samples whose ``targqc_json_fpath`` verifies are included. Each
    sample report optionally gets a project tag from ``tag_by_sample`` and a
    'Qualimap' record linking to the sample's Qualimap HTML, relative to
    ``output_dir``. With more than one sample, multi-sample Qualimap plots are
    generated as well.

    Returns ``(tsv_fpath, html_fpath)``, or ``(None, None)`` when no sample
    has a usable JSON report. ``bed_fpath`` is accepted but not read here.
    """
    jsons_by_sample = {
        s.name: s.targqc_json_fpath
        for s in samples if verify_file(s.targqc_json_fpath)
    }
    htmls_by_sample = dict()

    if not jsons_by_sample:
        # Same arity as the success path, so `tsv, html = ...` callers do not
        # crash (the previous three-element tuple could not be unpacked).
        return None, None

    targqc_full_report = FullReport.construct_from_sample_report_jsons(
        samples, output_dir, jsons_by_sample, htmls_by_sample)

    for sample_report in targqc_full_report.sample_reports:
        if tag_by_sample:
            sample_report.set_project_tag(
                tag_by_sample[sample_report.sample.name])
        if verify_file(sample_report.sample.qualimap_html_fpath):
            url = relpath(sample_report.sample.qualimap_html_fpath, output_dir)
            r = sample_report.find_record(sample_report.records, 'Qualimap')
            if r:
                r.url = url
            else:
                sample_report.add_record(metric_name='Qualimap',
                                         value='Qualimap',
                                         url=url,
                                         silent=True)

    if len(samples) > 1:
        run_multisample_qualimap(output_dir, work_dir, samples,
                                 targqc_full_report)

    # Output files are named after the first sample's text report.
    fn = splitext(basename(samples[0].targqc_txt_fpath))[0]
    tsv_fpath = targqc_full_report.save_tsv(join(output_dir, fn + '.tsv'))
    html_fpath = targqc_full_report.save_html(join(output_dir, fn + '.html'),
                                              'TargQC')

    return tsv_fpath, html_fpath
def _check_dir_not_empty(dirpath, description=None):
    """Assert that ``dirpath`` verifies as a directory and has usable contents.

    Hidden entries (names starting with '.') are ignored. At least one
    visible entry must exist, and every visible entry that resolves to a
    regular file must pass ``verify_file``.
    """
    assert verify_dir(dirpath, description=description), dirpath

    contents = []
    for fname in os.listdir(dirpath):
        if not fname.startswith('.'):
            contents.append(join(dirpath, fname))
    assert len(contents) >= 1, dirpath + ': ' + str(contents)

    for fpath in contents:
        resolved = realpath(fpath)
        if isfile(resolved):
            assert verify_file(resolved, is_critical=True), dirpath + ': ' + str(contents)
def _make_qualimap_bed(self, work_dir): if self.is_wgs: return None self.qualimap_bed_fpath = intermediate_fname(work_dir, self.capture_bed_fpath, 'qualimap_ready') if can_reuse(self.qualimap_bed_fpath, self.capture_bed_fpath): return self.qualimap_bed_fpath debug('Merging and saving BED into required bed6 format for Qualimap') bed = self.bed.sort().merge() with file_transaction(work_dir, self.qualimap_bed_fpath) as tx: with open(tx, 'w') as out: for i, region in enumerate(x for x in bed): region = [x for x in list(region) if x] fillers = [str(i), "1.0", "+"] full = region + fillers[:6 - len(region)] out.write("\t".join(full) + "\n") verify_file(self.qualimap_bed_fpath, is_critical=True) return self.qualimap_bed_fpath
def run_qualimap(work_dir, output_dir, output_fpaths, bam_fpath, genome, bed_fpath=None, threads=1):
    """Run ``qualimap bamqc`` on one BAM unless all expected outputs can be reused.

    If the run fails with an unsorted-BAM error, the BAM is sorted and
    Qualimap is run once more. A critical error is reported when any of
    ``output_fpaths`` does not verify afterwards. Returns ``output_dir``.
    """
    info('Analysing ' + bam_fpath)

    safe_mkdir(dirname(output_dir))
    safe_mkdir(output_dir)

    mem_cmdl = '--java-mem-size=' + str(int(get_qualimap_max_mem(bam_fpath))) + 'M'

    cmdline = (find_executable() +
               ' bamqc --skip-duplicated -nt {threads} {mem_cmdl} -nr 5000 '
               '-bam {bam_fpath} -outdir {output_dir}')
    if genome.startswith(('hg', 'GRCh')):
        cmdline += ' -gd HUMAN'
    elif genome.startswith('mm'):
        cmdline += ' -gd MOUSE'
    if bed_fpath:
        cmdline += ' -gff {bed_fpath}'
        debug('Using amplicons/capture panel ' + bed_fpath)
    # The placeholders are filled from the local variables of the same name.
    cmdline = cmdline.format(**locals())

    source_fpaths = [bam_fpath, bed_fpath] if bed_fpath else [bam_fpath]

    if not all(can_reuse(fp, source_fpaths) for fp in output_fpaths):
        for fp in output_fpaths:
            if isfile(fp):
                os.remove(fp)
        no_display = dict(DISPLAY=None)
        try:
            run(cmdline, env_vars=no_display)
        except subprocess.CalledProcessError as e:
            if 'The alignment file is unsorted.' in e.output:
                info()
                info('BAM file is unsorted; trying to sort and rerun QualiMap')
                sorted_bam_fpath = sort_bam(bam_fpath)
                cmdline = cmdline.replace(bam_fpath, sorted_bam_fpath)
                run(cmdline, env_vars=dict(DISPLAY=None))

    if not all(verify_file(fp, cmp_f=source_fpaths) for fp in output_fpaths):
        critical('Some of the QualiMap results were not generated')

    return output_dir
def merge_overlaps(work_dir, bed_fpath, distance=None):
    """Merge bed file intervals to avoid overlapping regions.

    Overlapping regions (1:1-100, 1:90-100) cause issues with callers like
    FreeBayes that don't collapse BEDs prior to using them.

    A truthy ``distance`` is passed to the merge as ``d``. Returns the path
    of the merged BED, reusing an existing file that verifies against
    ``bed_fpath``.
    """
    output_fpath = intermediate_fname(work_dir, bed_fpath, 'merged')
    already_merged = isfile(output_fpath) and verify_file(output_fpath, cmp_f=bed_fpath)

    if not already_merged:
        merge_kwargs = {'d': distance} if distance else {}
        with file_transaction(work_dir, output_fpath) as tx:
            BedTool(bed_fpath).merge(**merge_kwargs).saveas(tx)

    return output_fpath
def read_biomart(genome_name):
    """Load biomart records for ``genome_name`` into a dict keyed by transcript ID.

    For builds other than hg38, the hg38 biomart table is read as well:
    transcripts missing from the build's own table are added, and for the
    others the TSL value is overwritten with the hg38 one.

    Returns an empty dict when the biomart file of the requested genome
    cannot be verified.
    """
    id_col = 'Transcript ID'
    tsl_col = 'Transcript Support Level (TSL)'

    bm_fpath = ebl.biomart_fpath(genome_name)
    if not verify_file(bm_fpath):
        warn('Warning: biomart file for genome ' + genome_name + ' not found, skip using the TSL values')
        return dict()

    with open(bm_fpath) as f:
        features_by_ens_id = {rec[id_col]: rec for rec in csv.DictReader(f, delimiter='\t')}

    if genome_name.startswith('hg38'):
        return features_by_ens_id

    # hg38 version has TSL, checking if we can populate some TSL from it
    bm_fpath = ebl.biomart_fpath('hg38')
    if not verify_file(bm_fpath):
        critical('Biomart for hg38 file not found, and needed for TSL values')
    with open(bm_fpath) as f:
        for rec in csv.DictReader(f, delimiter='\t'):
            known = features_by_ens_id.get(rec[id_col])
            if known is None:
                features_by_ens_id[rec[id_col]] = rec
            else:
                known[tsl_col] = rec[tsl_col]
    return features_by_ens_id
def bam_to_bed_nocnf(bam_fpath, bedtools='bedtools', gzip='gzip'):
    """Convert a BAM to a gzipped BED placed next to it, via ``bedtools bamtobed``.

    The pipeline is executed with ``os.system``. Returns the verified
    ``.bed.gz`` path, or whatever falsy value ``verify_file`` gives when the
    result is missing or empty.
    """
    # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    info('Converting the BAM to BED to save some memory.')

    bam_bed_fpath = splitext_plus(bam_fpath)[0] + '.bed.gz'
    cmdline = '{bedtools} bamtobed -i {bam_fpath} | {gzip} > {bam_bed_fpath}'.format(
        bedtools=bedtools, bam_fpath=bam_fpath, gzip=gzip, bam_bed_fpath=bam_bed_fpath)
    info(cmdline)
    os.system(cmdline)

    bam_bed_fpath = verify_file(bam_bed_fpath)
    if not bam_bed_fpath:
        err('Error, result is non-existent or empty')
    else:
        info('Done, saved to ' + bam_bed_fpath)
    return bam_bed_fpath
def verify_bam(fpath, description='', is_critical=False, silent=False):
    """Return the adjusted path if ``fpath`` verifies and ends with ``.bam``, else ``None``.

    A wrong extension is reported through ``critical`` when ``is_critical``
    is set and through ``err`` otherwise.
    """
    file_ok = verify_file(fpath, description, is_critical=is_critical, silent=silent)
    if not file_ok:
        return None

    fpath = adjust_path(fpath)
    logfn = critical if is_critical else err

    # TODO: check if binary
    if fpath.endswith('.bam'):
        return fpath

    logfn('The file ' + fpath + ' is supposed to be BAM but does not have the .bam '
          'extension. Please, make sure you pass proper file.')
    return None
def get_canonical_transcripts_ids(genome):
    """Return a dict mapping gene name to canonical transcript ID for ``genome``.

    Entries from the cancer replacement list, when that file verifies,
    override the canonical list. Returns ``None`` when the canonical list
    itself cannot be verified.
    """
    def _read_pairs(fpath):
        # One tab-separated (gene name, transcript ID) pair per line.
        with open(fpath) as f:
            for line in f:
                yield line.strip('\n').split('\t')

    # GRCh builds are checked under their hg alias.
    short_genome = genome.split('-')[0]
    for grch_prefix, hg_name in (('GRCh37', 'hg19'), ('GRCh38', 'hg38')):
        if short_genome.startswith(grch_prefix):
            short_genome = hg_name
    check_genome(short_genome)

    canon_fpath = verify_file(
        _get(join('{genome}', 'canon_transcripts_{genome}_ensembl.txt'), genome),
        description='Canonical transcripts path')
    replacement_fpath = verify_file(
        _get('canon_cancer_replacement.txt'),
        description='Canonical cancer transcripts replacement path')

    if not canon_fpath:
        return None

    canon_tx_by_gname = dict(_read_pairs(canon_fpath))
    if replacement_fpath:
        for gname, tx_id in _read_pairs(replacement_fpath):
            canon_tx_by_gname[gname] = tx_id

    return canon_tx_by_gname
def run_multisample_qualimap(output_dir, work_dir, samples, targqc_full_report):
    """ 1. Generates Qualimap2 plots and put into plots_dirpath
        2. Adds records to targqc_full_report.plots

    plots_dirpath is <output_dir>/plots. Existing plots are reused when every
    visible file there can be reused with respect to the samples' Qualimap
    HTML reports. Otherwise `qualimap multi-bamqc` is run in
    <work_dir>/qualimap_multi_bamqc and its images directory is moved to
    plots_dirpath.

    Returns None in all cases; returns early (leaving targqc_full_report.plots
    untouched) when the Qualimap executable is not found or the run produced
    no plots directory.
    """
    plots_dirpath = join(output_dir, 'plots')
    individual_report_fpaths = [s.qualimap_html_fpath for s in samples]
    # Reuse only if the directory exists and no visible file in it is stale.
    if isdir(plots_dirpath) and not any(
            not can_reuse(join(plots_dirpath, f), individual_report_fpaths)
            for f in listdir(plots_dirpath) if not f.startswith('.')):
        debug('Qualimap miltisample plots exist - ' + plots_dirpath + ', reusing...')
    else:
        # Qualimap2 run for multi-sample plots
        if len([s.qualimap_html_fpath for s in samples if s.qualimap_html_fpath]) > 0:
            if find_executable() is not None:  # and get_qualimap_type(find_executable()) == 'full':
                qualimap_output_dir = join(work_dir, 'qualimap_multi_bamqc')

                # Fix up the per-sample Qualimap outputs before they are read
                # by the multi-sample run.
                _correct_qualimap_genome_results(samples)
                _correct_qualimap_insert_size_histogram(samples)

                safe_mkdir(qualimap_output_dir)
                # One (sample name, Qualimap HTML path) row per sample that has a report.
                rows = []
                for sample in samples:
                    if sample.qualimap_html_fpath:
                        rows += [[sample.name, sample.qualimap_html_fpath]]

                data_fpath = write_tsv_rows(([], rows), join(qualimap_output_dir, 'qualimap_results_by_sample.tsv'))
                qualimap_plots_dirpath = join(qualimap_output_dir, 'images_multisampleBamQcReport')
                cmdline = find_executable() + ' multi-bamqc --data {data_fpath} -outdir {qualimap_output_dir}'.format(**locals())
                # DISPLAY is removed from the environment; the only success
                # check is that the output directory verifies.
                run(cmdline, env_vars=dict(DISPLAY=None),
                    checks=[lambda _1, _2: verify_dir(qualimap_output_dir)], reuse=cfg.reuse_intermediate)

                if not verify_dir(qualimap_plots_dirpath):
                    warn('Warning: Qualimap for multi-sample analysis failed to finish. TargQC will not contain plots.')
                    return None
                else:
                    # Replace any previous plots with the freshly generated ones.
                    if exists(plots_dirpath):
                        shutil.rmtree(plots_dirpath)
                    shutil.move(qualimap_plots_dirpath, plots_dirpath)
            else:
                warn('Warning: Qualimap for multi-sample analysis was not found. TargQC will not contain plots.')
                return None

    # NOTE(review): when no sample has a Qualimap HTML report and plots_dirpath
    # does not exist, control reaches listdir() below on a missing directory -
    # confirm whether callers can hit this.
    targqc_full_report.plots = []
    for plot_fpath in listdir(plots_dirpath):
        plot_fpath = join(plots_dirpath, plot_fpath)
        # Only verified .png files are registered, as paths relative to output_dir.
        if verify_file(plot_fpath) and plot_fpath.endswith('.png'):
            targqc_full_report.plots.append(relpath(plot_fpath, output_dir))
def verify_bed(bed, description='', is_critical=False, silent=False):
    """Validate a BED input and return it in usable form.

    A ``BedTool`` instance is returned unchanged. A path is adjusted,
    verified as a file and format-checked; the adjusted path is returned on
    success and ``None`` on failure. Format errors are reported through
    ``critical`` when ``is_critical`` is set and through ``err`` otherwise.
    """
    if isinstance(bed, BedTool):
        return bed

    fpath = adjust_path(bed)
    file_ok = verify_file(fpath, description, is_critical=is_critical, silent=silent)
    if not file_ok:
        return None

    error = BedFile(fpath).checkformat()
    if not error:
        return fpath

    report = critical if is_critical else err
    report('Error: incorrect bed file format (' + fpath + '): ' + str(error) + '\n')
    return None
def verify_bam(fpath, description='', is_critical=False, silent=False):
    """Return the adjusted path if ``fpath`` verifies and ends with ``.bam``, else ``None``.

    A wrong extension is reported through ``critical`` when ``is_critical``
    is set and through ``err`` otherwise.
    """
    file_ok = verify_file(fpath, description, is_critical=is_critical, silent=silent)
    if not file_ok:
        return None

    fpath = adjust_path(fpath)
    logfn = critical if is_critical else err

    # TODO: check if binary
    if fpath.endswith('.bam'):
        return fpath

    logfn('The file ' + fpath + ' is supposed to be BAM but does not have the .bam '
          'extension. Please, make sure you pass proper file.')
    return None
def run_qualimap(work_dir, output_dir, output_fpaths, bam_fpath, genome, bed_fpath=None, threads=1):
    """Run ``qualimap bamqc`` on one BAM unless all expected outputs can be reused.

    If the run fails with an unsorted-BAM error, the BAM is sorted and
    Qualimap is run once more. A critical error is reported when any of
    ``output_fpaths`` does not verify afterwards. Returns ``output_dir``.
    """
    info('Analysing ' + bam_fpath)

    safe_mkdir(dirname(output_dir))
    safe_mkdir(output_dir)

    mem_cmdl = '--java-mem-size=' + str(int(get_qualimap_max_mem(bam_fpath))) + 'M'

    cmdline = (find_executable() +
               ' bamqc --skip-duplicated -nt {threads} {mem_cmdl} -nr 5000 '
               '-bam {bam_fpath} -outdir {output_dir}')
    if genome.startswith(('hg', 'GRCh')):
        cmdline += ' -gd HUMAN'
    elif genome.startswith('mm'):
        cmdline += ' -gd MOUSE'
    if bed_fpath:
        cmdline += ' -gff {bed_fpath}'
        debug('Using amplicons/capture panel ' + bed_fpath)
    # The placeholders are filled from the local variables of the same name.
    cmdline = cmdline.format(**locals())

    source_fpaths = [bam_fpath, bed_fpath] if bed_fpath else [bam_fpath]

    if not all(can_reuse(fp, source_fpaths) for fp in output_fpaths):
        for fp in output_fpaths:
            if isfile(fp):
                os.remove(fp)
        no_display = dict(DISPLAY=None)
        try:
            run(cmdline, env_vars=no_display)
        except subprocess.CalledProcessError as e:
            if 'The alignment file is unsorted.' in e.output:
                info()
                info('BAM file is unsorted; trying to sort and rerun QualiMap')
                sorted_bam_fpath = sort_bam(bam_fpath)
                cmdline = cmdline.replace(bam_fpath, sorted_bam_fpath)
                run(cmdline, env_vars=dict(DISPLAY=None))

    if not all(verify_file(fp, cmp_f=source_fpaths) for fp in output_fpaths):
        critical('Some of the QualiMap results were not generated')

    return output_dir
def _make_target_bed(self, bed_fpath, work_dir, output_dir, is_debug,
                     padding=None, fai_fpath=None, genome=None, reannotate=False):
    """Prepare the target BED through a chain of intermediate files.

    Stages, each skipped when its output can be reused:
      1. clean  - keep at most the first 4 columns, drop header-like and invalid records;
      2. sort   - sort via sort_bed() using fai_fpath;
      3. annotate (only for genomes in ebl.SUPPORTED_GENOMES) - overlap with genomic features;
      4. clean again - drop invalid records from the previous stage's result.

    Sets self.bed_fpath, self.bed, self.capture_bed_fpath, self.gene_keys_set,
    self.gene_keys_list and self.regions_num, then builds the Qualimap BED
    and, when padding is truthy, the padded BED.

    is_debug is accepted but not read by this method.
    """
    clean_target_bed_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')
    if not can_reuse(clean_target_bed_fpath, bed_fpath):
        debug()
        debug('Cleaning target BED file...')
        bed = BedTool(bed_fpath)
        if bed.field_count() > 4:
            bed = bed.cut(range(4))
        # Drop records without a chromosome and comment/track/browser header lines.
        bed = bed\
            .filter(lambda x: x.chrom and not any(x.chrom.startswith(e) for e in ['#', ' ', 'track', 'browser']))\
            .remove_invalid()
        with file_transaction(work_dir, clean_target_bed_fpath) as tx:
            bed.saveas(tx)
        debug('Saved to ' + clean_target_bed_fpath)
        verify_file(clean_target_bed_fpath, is_critical=True)

    sort_target_bed_fpath = intermediate_fname(work_dir, clean_target_bed_fpath, 'sorted')
    if not can_reuse(sort_target_bed_fpath, clean_target_bed_fpath):
        debug()
        debug('Sorting target BED file...')
        sort_target_bed_fpath = sort_bed(clean_target_bed_fpath, output_bed_fpath=sort_target_bed_fpath, fai_fpath=fai_fpath)
        debug('Saved to ' + sort_target_bed_fpath)
        verify_file(sort_target_bed_fpath, is_critical=True)

    if genome in ebl.SUPPORTED_GENOMES:
        ann_target_bed_fpath = intermediate_fname(work_dir, sort_target_bed_fpath, 'ann_plus_features')
        if not can_reuse(ann_target_bed_fpath, sort_target_bed_fpath):
            debug()
            # reannotate is forwarded only for a 3-column BED or when re-annotation is requested.
            if BedTool(sort_target_bed_fpath).field_count() == 3 or reannotate:
                debug('Annotating target BED file and collecting overlapping genome features')
                overlap_with_features(sort_target_bed_fpath, ann_target_bed_fpath, work_dir=work_dir,
                     genome=genome, extended=True, reannotate=reannotate, only_canonical=True)
            else:
                debug('Overlapping with genomic features:')
                overlap_with_features(sort_target_bed_fpath, ann_target_bed_fpath, work_dir=work_dir,
                     genome=genome, extended=True, only_canonical=True)
            debug('Saved to ' + ann_target_bed_fpath)
            verify_file(ann_target_bed_fpath, is_critical=True)
    else:
        # Unsupported genome: the sorted BED is used without annotation.
        ann_target_bed_fpath = sort_target_bed_fpath

    final_clean_target_bed_fpath = intermediate_fname(work_dir, ann_target_bed_fpath, 'clean')
    if not can_reuse(final_clean_target_bed_fpath, ann_target_bed_fpath):
        bed = BedTool(ann_target_bed_fpath).remove_invalid()
        with file_transaction(work_dir, final_clean_target_bed_fpath) as tx:
            bed.saveas(tx)
            pass
        verify_file(final_clean_target_bed_fpath, is_critical=True)

    self.bed_fpath = final_clean_target_bed_fpath
    self.bed = BedTool(self.bed_fpath)

    # The capture BED goes to output_dir, named after the original input BED.
    self.capture_bed_fpath = add_suffix(join(output_dir, basename(bed_fpath)), 'clean_sorted_ann')
    if not can_reuse(self.capture_bed_fpath, self.bed_fpath):
        with file_transaction(work_dir, self.capture_bed_fpath) as tx:
            self.get_capture_bed().saveas(tx)

    # Gene keys are collected from the original input BED, not the processed one.
    gene_key_set, gene_key_list = get_genes_from_bed(bed_fpath)
    self.gene_keys_set = gene_key_set
    self.gene_keys_list = gene_key_list
    self.regions_num = self.get_capture_bed().count()

    self._make_qualimap_bed(work_dir)
    if padding:
        self._make_padded_bed(work_dir, fai_fpath, padding)
def _save_best_details_for_each_gene(depth_threshs, samples, output_dir):
    """Build a 'Best' per-region report by combining the regional TSVs of all samples.

    The per-sample regional reports are read in lock-step, one line from each
    file at a time. For every row that is a 'Whole-Gene' or 'Gene-Exon' row in
    all samples, the per-sample values of each metric are reduced with
    select_best() and stored in one row of the combined report.

    Saves the report as .txt and .tsv in output_dir and returns the .txt
    path; returns None when no sample has a verifiable regional report.
    """
    metric_storage = get_detailed_metric_storage(depth_threshs)

    report = PerRegionSampleReport(sample='Best', metric_storage=metric_storage)
    report.add_record('Sample', 'contains best values from all samples: ' + ', '.join([s.name for s in samples]))

    total_regions = 0
    fpaths = [s.targqc_region_tsv for s in samples if verify_file(s.targqc_region_tsv)]
    if not fpaths:
        err('No targetcov detailed per-gene report was generated; skipping.')
        return None

    # NOTE(review): these handles are closed only at the end of the function;
    # an exception in between would leave them open.
    open_tsv_files = [open(fpath) for fpath in fpaths]

    # Header scan: skip '##' lines and stop at the first single-'#' line.
    # A '#Sample' header shifts all data columns right by one.
    first_col = 0
    while True:
        lines_for_each_sample = [next(f, None) for f in open_tsv_files]
        if not all(lines_for_each_sample):
            break
        l = lines_for_each_sample[0]
        if l.startswith('##'):
            continue
        elif l.startswith('#'):
            if l.startswith('#Sample'):
                first_col = 1
            break

    # Data scan: stops as soon as any file runs out of lines (or yields an empty one).
    while True:
        lines_for_each_sample = [next(f, None) for f in open_tsv_files]
        if not all(lines_for_each_sample):
            break

        if all([not l.startswith('#') and ('Whole-Gene' in l or 'Gene-Exon' in l) for l in lines_for_each_sample]):
            # The nine region-describing columns are taken from the first sample's line.
            shared_fields = lines_for_each_sample[0].split('\t')[first_col:first_col+9]
            reg = report.add_row()
            reg.add_record('Chr', get_val(shared_fields[0]))
            reg.add_record('Start', get_int_val(shared_fields[1]))
            reg.add_record('End', get_int_val(shared_fields[2]))
            reg.add_record('Size', get_int_val(shared_fields[3]))
            reg.add_record('Gene', get_val(shared_fields[4]))
            reg.add_record('Strand', get_val(shared_fields[5]))
            reg.add_record('Feature', get_val(shared_fields[6]))
            reg.add_record('Biotype', get_val(shared_fields[7]))
            reg.add_record('Transcript', get_val(shared_fields[8]))

            min_depths, ave_depths, stddevs, withins = ([], [], [], [])
            percents_by_threshs = {t: [] for t in depth_threshs}

            # Collect each metric across samples; the columns from first_col+13
            # onward are paired with depth_threshs in order.
            for l in lines_for_each_sample:
                fs = l.split('\t')

                min_depths.append(get_int_val(fs[first_col+9]))
                ave_depths.append(get_float_val(fs[first_col+10]))
                stddevs.append(get_float_val(fs[first_col+11]))
                withins.append(get_float_val(fs[first_col+12]))
                for t, f in zip(depth_threshs, fs[first_col+13:]):
                    percents_by_threshs[t].append(get_float_val(f))

            # counting bests
            reg.add_record('Min depth', select_best(min_depths))
            reg.add_record('Ave depth', select_best(ave_depths))
            # 'Std dev' is the only metric that passes max to select_best.
            reg.add_record('Std dev', select_best(stddevs, max))
            reg.add_record('W/n 20% of median depth', select_best(withins))
            for t in depth_threshs:
                reg.add_record('{}x'.format(t), select_best(percents_by_threshs[t]))

            total_regions += 1

    for f in open_tsv_files:
        f.close()

    gene_report_basename = add_suffix(samples[0].targqc_region_tsv, 'best')
    txt_rep_fpath = report.save_txt(join(output_dir, gene_report_basename + '.txt'))
    tsv_rep_fpath = report.save_tsv(join(output_dir, gene_report_basename + '.tsv'))
    info('')
    info('Best values for the regions (total ' + str(total_regions) + ') saved into:')
    info('  ' + txt_rep_fpath)

    return txt_rep_fpath
def main():
    """Write ``<genome>/ensembl.bed`` next to this script from an Ensembl GTF.

    Command line: ``GENOME [db.gtf] [--debug]``. When no GTF path is given it
    is obtained from ``ebl.ensembl_gtf_fpath``. Features are first written to
    an unsorted file, which is then passed to ``sort_bed`` and removed; the
    sorted result is handed to ``bgzip_and_tabix``.
    """
    # Real concatenation: the previous single string literal printed the text
    # "' + __file__ + '" verbatim instead of the script path.
    description = 'Usage: ' + __file__ + ' hg19 [db.gtf]'
    options = [
        (['--debug'], dict(dest='debug', action='store_true', default=False)),
    ]
    parser = OptionParser(description=description)
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()
    if not args:
        parser.exit(1, 'Please provide genome name as the first argument')

    logger.is_debug = opts.debug

    genome_name = args[0]
    if len(args) > 1:
        gtf_fpath = args[1]
    else:
        gtf_fpath = ebl.ensembl_gtf_fpath(genome_name)
    # Fall back to the gzipped file name when the plain path is not a file.
    if not isfile(gtf_fpath):
        if not gtf_fpath.endswith('.gz'):
            gtf_fpath += '.gz'
    gtf_fpath = verify_file(gtf_fpath)

    debug('Reading the GTF database')
    db = gtf.get_gtf_db(gtf_fpath)

    debug('Reading biomart data')
    features_by_ens_id = read_biomart(genome_name)

    chroms = [c for c, l in ref.get_chrom_lengths(genome_name)]

    output_fpath = join(dirname(__file__), genome_name, 'ensembl.bed')
    unsorted_output_fpath = add_suffix(output_fpath, 'unsorted')
    debug('Processing features, writing to ' + unsorted_output_fpath)

    def _get(_rec, _key):
        # Attribute values are lists; return the single element or None.
        val = _rec.attributes.get(_key)
        if val is None:
            return None
        assert len(val) == 1, (_key, str(val))
        return val[0]

    num_tx_not_in_biomart = 0
    num_tx_diff_gene_in_biomart = 0

    with open(unsorted_output_fpath, 'w') as out:
        out.write('\t'.join(ebl.BedCols.names[i] for i in ebl.BedCols.cols[:-4]) + '\n')

        for rec in db.all_features(order_by=('seqid', 'start', 'end')):
            if rec.featuretype == 'gene':
                continue
            if rec.chrom not in chroms:
                continue
            if rec.end - rec.start < 0:
                continue

            tx_id = _get(rec, 'transcript_id')
            gname = _get(rec, 'gene_name')
            tx_biotype = _get(rec, 'transcript_biotype')
            if not tx_biotype:
                tx_biotype = _get(rec, 'gene_biotype')
            tsl = _get(rec, 'transcript_support_level')
            hugo_gene = None

            biomart_rec = features_by_ens_id.get(tx_id)
            if not biomart_rec:
                # Keep the GTF values; only 'transcript' records are counted.
                if rec.featuretype == 'transcript':
                    num_tx_not_in_biomart += 1
            else:
                bm_gname = biomart_rec['Associated Gene Name']
                bm_tx_biotype = biomart_rec['Transcript type']
                bm_tsl = biomart_rec.get('Transcript Support Level (TSL)')
                hugo_gene = biomart_rec['HGNC symbol']
                if bm_gname != gname:
                    # Records whose gene name disagrees with biomart are dropped.
                    if rec.featuretype == 'transcript':
                        num_tx_diff_gene_in_biomart += 1
                    continue
                tx_biotype = bm_tx_biotype
                tsl = bm_tsl.split()[0].replace('tsl', '') if bm_tsl else None

            # NOTE(review): the header above is built from cols[:-4] while each
            # row is sized from cols[:-3] - confirm which width is intended.
            fs = [None] * len(ebl.BedCols.cols[:-3])
            if not rec.chrom.startswith('chr'):
                rec.chrom = 'chr' + rec.chrom.replace('MT', 'M')
            fs[:6] = [
                rec.chrom,
                str(rec.start - 1),
                str(rec.end),
                gname,
                rec.attributes.get('exon_number', ['.'])[0],
                rec.strand,
            ]
            fs[ebl.BedCols.FEATURE] = rec.featuretype or '.'
            fs[ebl.BedCols.BIOTYPE] = tx_biotype or '.'
            fs[ebl.BedCols.ENSEMBL_ID] = tx_id or '.'
            # fs[ebl.BedCols.REFSEQ_ID] = refseq_id or '.'
            # fs[ebl.BedCols.IS_CANONICAL] = 'canonical' if refseq_id in canonical_transcripts_ids else ''
            fs[ebl.BedCols.TSL] = tsl or '.'
            fs[ebl.BedCols.HUGO] = hugo_gene or '.'
            # fs[ebl.BedCols.names[ensembl.BedCols.GC]] = gc
            out.write('\t'.join(fs) + '\n')

    if num_tx_not_in_biomart:
        warn(str(num_tx_not_in_biomart) + ' transcripts not found in biomart')
    if num_tx_diff_gene_in_biomart:
        warn(str(num_tx_diff_gene_in_biomart) + ' transcripts have a different gene name in biomart')

    debug('Sorting results')
    sort_bed(unsorted_output_fpath, output_fpath,
             fai_fpath=ref.get_fai(genome_name), genome=genome_name)
    os.remove(unsorted_output_fpath)
    bgzip_and_tabix(output_fpath)
def _save_best_details_for_each_gene(depth_threshs, samples, output_dir):
    """Combine per-sample region reports into one report of best values.

    The region TSV files of all samples are read in lockstep (one line from
    every file per iteration), so line N of each file is treated as the same
    region. Reading stops as soon as any file runs out of lines.

    :param depth_threshs: depth thresholds; one '<threshold>x' record is
        written per threshold for every region.
    :param samples: sample objects; ``name`` and ``targqc_region_tsv`` are read.
    :param output_dir: directory the '.txt' and '.tsv' reports are saved into.
    :return: the value returned by ``report.save_txt``, or None when no sample
        has a region TSV that passes ``verify_file``.
    """
    metric_storage = get_detailed_metric_storage(depth_threshs)

    report = PerRegionSampleReport(sample='Best', metric_storage=metric_storage)
    report.add_record(
        'Sample',
        'contains best values from all samples: ' + ', '.join([s.name for s in samples]))

    total_regions = 0
    fpaths = [s.targqc_region_tsv for s in samples if verify_file(s.targqc_region_tsv)]
    if not fpaths:
        err('No targetcov detailed per-gene report was generated; skipping.')
        return None

    open_tsv_files = []
    try:
        # Open one by one so that a failure half-way through still lets the
        # ``finally`` clause close the handles that were already opened.
        for fpath in fpaths:
            open_tsv_files.append(open(fpath))

        # Header scan: '##' lines are skipped, the first single-'#' line ends
        # the scan. A '#Sample' header means every row carries an extra
        # leading column, so all field indices shift by one.
        first_col = 0
        while True:
            lines_for_each_sample = [next(f, None) for f in open_tsv_files]
            if not all(lines_for_each_sample):
                break
            header = lines_for_each_sample[0]
            if header.startswith('##'):
                continue
            elif header.startswith('#'):
                if header.startswith('#Sample'):
                    first_col = 1
                break

        while True:
            lines_for_each_sample = [next(f, None) for f in open_tsv_files]
            if not all(lines_for_each_sample):
                break

            # Only rows that are 'Whole-Gene' or 'Gene-Exon' records in every
            # sample contribute to the report.
            if not all(not line.startswith('#') and ('Whole-Gene' in line or 'Gene-Exon' in line)
                       for line in lines_for_each_sample):
                continue

            # Region description columns are taken from the first sample only.
            shared_fields = lines_for_each_sample[0].split('\t')[first_col:first_col + 9]
            reg = report.add_row()
            reg.add_record('Chr', get_val(shared_fields[0]))
            reg.add_record('Start', get_int_val(shared_fields[1]))
            reg.add_record('End', get_int_val(shared_fields[2]))
            reg.add_record('Size', get_int_val(shared_fields[3]))
            reg.add_record('Gene', get_val(shared_fields[4]))
            reg.add_record('Strand', get_val(shared_fields[5]))
            reg.add_record('Feature', get_val(shared_fields[6]))
            reg.add_record('Biotype', get_val(shared_fields[7]))
            reg.add_record('Transcript', get_val(shared_fields[8]))

            min_depths, ave_depths, stddevs, withins = [], [], [], []
            percents_by_threshs = {t: [] for t in depth_threshs}

            for line in lines_for_each_sample:
                fs = line.split('\t')
                min_depths.append(get_int_val(fs[first_col + 9]))
                ave_depths.append(get_float_val(fs[first_col + 10]))
                stddevs.append(get_float_val(fs[first_col + 11]))
                withins.append(get_float_val(fs[first_col + 12]))
                for t, field in zip(depth_threshs, fs[first_col + 13:]):
                    percents_by_threshs[t].append(get_float_val(field))

            # counting bests
            reg.add_record('Min depth', select_best(min_depths))
            reg.add_record('Ave depth', select_best(ave_depths))
            reg.add_record('Std dev', select_best(stddevs, max))
            reg.add_record('W/n 20% of median depth', select_best(withins))
            for t in depth_threshs:
                reg.add_record('{}x'.format(t), select_best(percents_by_threshs[t]))

            total_regions += 1
    finally:
        for tsv_file in open_tsv_files:
            tsv_file.close()

    gene_report_basename = add_suffix(samples[0].targqc_region_tsv, 'best')
    txt_rep_fpath = report.save_txt(join(output_dir, gene_report_basename + '.txt'))
    report.save_tsv(join(output_dir, gene_report_basename + '.tsv'))

    info('')
    info('Best values for the regions (total ' + str(total_regions) + ') saved into:')
    info(' ' + txt_rep_fpath)

    return txt_rep_fpath
def run_multisample_qualimap(output_dir, work_dir, samples, targqc_full_report):
    """Produce Qualimap multi-sample plots and register them on the report.

    1. Unless the existing ``<output_dir>/plots`` directory can be reused,
       runs ``multi-bamqc`` and moves the resulting images directory to
       ``<output_dir>/plots``.
    2. Sets ``targqc_full_report.plots`` to the '.png' files found there,
       as paths relative to ``output_dir``.

    :param output_dir: directory that receives the ``plots`` sub-directory.
    :param work_dir: directory used for the intermediate Qualimap output.
    :param samples: sample objects; ``name`` and ``qualimap_html_fpath`` are read.
    :param targqc_full_report: report object whose ``plots`` attribute is set.
    :return: None. ``targqc_full_report.plots`` is left untouched when the
        executable is not found, the run leaves no images directory, or there
        is no plots directory to list.
    """
    plots_dirpath = join(output_dir, 'plots')
    individual_report_fpaths = [s.qualimap_html_fpath for s in samples]

    if isdir(plots_dirpath) and not any(
            not can_reuse(join(plots_dirpath, f), individual_report_fpaths)
            for f in listdir(plots_dirpath) if not f.startswith('.')):
        debug('Qualimap miltisample plots exist - ' + plots_dirpath + ', reusing...')
    else:
        # Qualimap2 run for multi-sample plots
        if any(s.qualimap_html_fpath for s in samples):
            qualimap = find_executable()
            if qualimap is not None:  # and get_qualimap_type(find_executable()) == 'full':
                qualimap_output_dir = join(work_dir, 'qualimap_multi_bamqc')

                _correct_qualimap_genome_results(samples)
                _correct_qualimap_insert_size_histogram(samples)

                safe_mkdir(qualimap_output_dir)
                rows = [[sample.name, sample.qualimap_html_fpath]
                        for sample in samples if sample.qualimap_html_fpath]

                data_fpath = write_tsv_rows(
                    ([], rows),
                    join(qualimap_output_dir, 'qualimap_results_by_sample.tsv'))
                qualimap_plots_dirpath = join(qualimap_output_dir, 'images_multisampleBamQcReport')
                # Explicit keywords instead of **locals(), so the template
                # does not depend on whatever local names happen to exist.
                cmdline = qualimap + ' multi-bamqc --data {data_fpath} -outdir {qualimap_output_dir}'.format(
                    data_fpath=data_fpath, qualimap_output_dir=qualimap_output_dir)
                run(cmdline,
                    env_vars=dict(DISPLAY=None),
                    checks=[lambda _1, _2: verify_dir(qualimap_output_dir)],
                    reuse=cfg.reuse_intermediate)

                if not verify_dir(qualimap_plots_dirpath):
                    warn('Warning: Qualimap for multi-sample analysis failed to finish. TargQC will not contain plots.')
                    return None
                else:
                    if exists(plots_dirpath):
                        shutil.rmtree(plots_dirpath)
                    shutil.move(qualimap_plots_dirpath, plots_dirpath)
            else:
                warn('Warning: Qualimap for multi-sample analysis was not found. TargQC will not contain plots.')
                return None

    # When no sample has a Qualimap report and no plots directory exists yet,
    # nothing was generated above; listing the missing directory would raise.
    if not isdir(plots_dirpath):
        return None

    targqc_full_report.plots = []
    for plot_fname in listdir(plots_dirpath):
        plot_fpath = join(plots_dirpath, plot_fname)
        if verify_file(plot_fpath) and plot_fpath.endswith('.png'):
            targqc_full_report.plots.append(relpath(plot_fpath, output_dir))
def main():
    """Write ``<genome>/ensembl.bed`` next to this script from an Ensembl GTF.

    Command line: ``GENOME [db.gtf] [--debug]``. When no GTF path is given it
    is obtained from ``ebl.ensembl_gtf_fpath``. Features are first written to
    an unsorted file, which is then passed to ``sort_bed`` and removed; the
    sorted result is handed to ``bgzip_and_tabix``.
    """
    # Real concatenation: the previous single string literal printed the text
    # "' + __file__ + '" verbatim instead of the script path.
    description = 'Usage: ' + __file__ + ' hg19 [db.gtf]'
    options = [
        (['--debug'], dict(dest='debug', action='store_true', default=False)),
    ]
    parser = OptionParser(description=description)
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()
    if not args:
        parser.exit(1, 'Please provide genome name as the first argument')

    logger.is_debug = opts.debug

    genome_name = args[0]
    if len(args) > 1:
        gtf_fpath = args[1]
    else:
        gtf_fpath = ebl.ensembl_gtf_fpath(genome_name)
    # Fall back to the gzipped file name when the plain path is not a file.
    if not isfile(gtf_fpath):
        if not gtf_fpath.endswith('.gz'):
            gtf_fpath += '.gz'
    gtf_fpath = verify_file(gtf_fpath)

    debug('Reading the GTF database')
    db = gtf.get_gtf_db(gtf_fpath)

    debug('Reading biomart data')
    features_by_ens_id = read_biomart(genome_name)

    chroms = [c for c, l in ref.get_chrom_lengths(genome_name)]

    output_fpath = join(dirname(__file__), genome_name, 'ensembl.bed')
    unsorted_output_fpath = add_suffix(output_fpath, 'unsorted')
    debug('Processing features, writing to ' + unsorted_output_fpath)

    def _get(_rec, _key):
        # Attribute values are lists; return the single element or None.
        val = _rec.attributes.get(_key)
        if val is None:
            return None
        assert len(val) == 1, (_key, str(val))
        return val[0]

    num_tx_not_in_biomart = 0
    num_tx_diff_gene_in_biomart = 0

    with open(unsorted_output_fpath, 'w') as out:
        out.write('\t'.join(ebl.BedCols.names[i] for i in ebl.BedCols.cols[:-4]) + '\n')

        for rec in db.all_features(order_by=('seqid', 'start', 'end')):
            if rec.featuretype == 'gene':
                continue
            if rec.chrom not in chroms:
                continue
            if rec.end - rec.start < 0:
                continue

            tx_id = _get(rec, 'transcript_id')
            gname = _get(rec, 'gene_name')
            tx_biotype = _get(rec, 'transcript_biotype')
            if not tx_biotype:
                tx_biotype = _get(rec, 'gene_biotype')
            tsl = _get(rec, 'transcript_support_level')
            hugo_gene = None

            biomart_rec = features_by_ens_id.get(tx_id)
            if not biomart_rec:
                # Keep the GTF values; only 'transcript' records are counted.
                if rec.featuretype == 'transcript':
                    num_tx_not_in_biomart += 1
            else:
                bm_gname = biomart_rec['Associated Gene Name']
                bm_tx_biotype = biomart_rec['Transcript type']
                bm_tsl = biomart_rec.get('Transcript Support Level (TSL)')
                hugo_gene = biomart_rec['HGNC symbol']
                if bm_gname != gname:
                    # Records whose gene name disagrees with biomart are dropped.
                    if rec.featuretype == 'transcript':
                        num_tx_diff_gene_in_biomart += 1
                    continue
                tx_biotype = bm_tx_biotype
                tsl = bm_tsl.split()[0].replace('tsl', '') if bm_tsl else None

            # NOTE(review): the header above is built from cols[:-4] while each
            # row is sized from cols[:-3] - confirm which width is intended.
            fs = [None] * len(ebl.BedCols.cols[:-3])
            if not rec.chrom.startswith('chr'):
                rec.chrom = 'chr' + rec.chrom.replace('MT', 'M')
            fs[:6] = [
                rec.chrom,
                str(rec.start - 1),
                str(rec.end),
                gname,
                rec.attributes.get('exon_number', ['.'])[0],
                rec.strand,
            ]
            fs[ebl.BedCols.FEATURE] = rec.featuretype or '.'
            fs[ebl.BedCols.BIOTYPE] = tx_biotype or '.'
            fs[ebl.BedCols.ENSEMBL_ID] = tx_id or '.'
            # fs[ebl.BedCols.REFSEQ_ID] = refseq_id or '.'
            # fs[ebl.BedCols.IS_CANONICAL] = 'canonical' if refseq_id in canonical_transcripts_ids else ''
            fs[ebl.BedCols.TSL] = tsl or '.'
            fs[ebl.BedCols.HUGO] = hugo_gene or '.'
            # fs[ebl.BedCols.names[ensembl.BedCols.GC]] = gc
            out.write('\t'.join(fs) + '\n')

    if num_tx_not_in_biomart:
        warn(str(num_tx_not_in_biomart) + ' transcripts not found in biomart')
    if num_tx_diff_gene_in_biomart:
        warn(str(num_tx_diff_gene_in_biomart) + ' transcripts have a different gene name in biomart')

    debug('Sorting results')
    sort_bed(unsorted_output_fpath, output_fpath,
             fai_fpath=ref.get_fai(genome_name), genome=genome_name)
    os.remove(unsorted_output_fpath)
    bgzip_and_tabix(output_fpath)
def _make_target_bed(self, bed_fpath, work_dir, output_dir, is_debug,
                     padding=None, fai_fpath=None, genome=None, reannotate=False):
    """Derive the working target BED files from ``bed_fpath`` and store them on ``self``.

    Stages, each skipped when ``can_reuse`` accepts the existing output:
    clean -> sort -> overlap with genome features (only when ``genome`` is in
    ``ebl.SUPPORTED_GENOMES``) -> final ``remove_invalid`` pass. Afterwards the
    capture BED is saved under ``output_dir``, gene keys are read from the
    original ``bed_fpath``, and the Qualimap BED (plus the padded BED when
    ``padding`` is truthy) is built.

    Sets ``self.bed_fpath``, ``self.bed``, ``self.capture_bed_fpath``,
    ``self.gene_keys_set``, ``self.gene_keys_list`` and ``self.regions_num``.
    ``is_debug`` is accepted but not used here.
    """
    def _is_region_line(interval):
        # Drop entries with an empty chromosome or a header-like prefix.
        return interval.chrom and not any(
            interval.chrom.startswith(prefix) for prefix in ['#', ' ', 'track', 'browser'])

    # Stage 1: at most four columns, header-like lines and invalid records removed.
    cleaned_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')
    if not can_reuse(cleaned_fpath, bed_fpath):
        debug()
        debug('Cleaning target BED file...')
        cleaned = BedTool(bed_fpath)
        if cleaned.field_count() > 4:
            cleaned = cleaned.cut(range(4))
        cleaned = cleaned.filter(_is_region_line).remove_invalid()
        with file_transaction(work_dir, cleaned_fpath) as tx:
            cleaned.saveas(tx)
        debug('Saved to ' + cleaned_fpath)
        verify_file(cleaned_fpath, is_critical=True)

    # Stage 2: sorting.
    sorted_fpath = intermediate_fname(work_dir, cleaned_fpath, 'sorted')
    if not can_reuse(sorted_fpath, cleaned_fpath):
        debug()
        debug('Sorting target BED file...')
        sorted_fpath = sort_bed(cleaned_fpath, output_bed_fpath=sorted_fpath, fai_fpath=fai_fpath)
        debug('Saved to ' + sorted_fpath)
        verify_file(sorted_fpath, is_critical=True)

    # Stage 3: overlap with genome features, for supported genomes only.
    if genome not in ebl.SUPPORTED_GENOMES:
        annotated_fpath = sorted_fpath
    else:
        annotated_fpath = intermediate_fname(work_dir, sorted_fpath, 'ann_plus_features')
        if not can_reuse(annotated_fpath, sorted_fpath):
            debug()
            overlap_kwargs = dict(work_dir=work_dir, genome=genome,
                                  extended=True, only_canonical=True)
            if BedTool(sorted_fpath).field_count() == 3 or reannotate:
                debug('Annotating target BED file and collecting overlapping genome features')
                # ``reannotate`` is forwarded only on this branch.
                overlap_kwargs['reannotate'] = reannotate
            else:
                debug('Overlapping with genomic features:')
            overlap_with_features(sorted_fpath, annotated_fpath, **overlap_kwargs)
            debug('Saved to ' + annotated_fpath)
            verify_file(annotated_fpath, is_critical=True)

    # Stage 4: final validity pass over whichever file stage 3 produced.
    final_fpath = intermediate_fname(work_dir, annotated_fpath, 'clean')
    if not can_reuse(final_fpath, annotated_fpath):
        validated = BedTool(annotated_fpath).remove_invalid()
        with file_transaction(work_dir, final_fpath) as tx:
            validated.saveas(tx)
        verify_file(final_fpath, is_critical=True)

    self.bed_fpath = final_fpath
    self.bed = BedTool(self.bed_fpath)

    capture_basename = join(output_dir, basename(bed_fpath))
    self.capture_bed_fpath = add_suffix(capture_basename, 'clean_sorted_ann')
    if not can_reuse(self.capture_bed_fpath, self.bed_fpath):
        with file_transaction(work_dir, self.capture_bed_fpath) as tx:
            self.get_capture_bed().saveas(tx)

    self.gene_keys_set, self.gene_keys_list = get_genes_from_bed(bed_fpath)
    self.regions_num = self.get_capture_bed().count()

    self._make_qualimap_bed(work_dir)
    if padding:
        self._make_padded_bed(work_dir, fai_fpath, padding)
def parse_qualimap_results(sample):
    """Collect QualiMap metrics for one sample into four dictionaries.

    :param sample: sample object; reads ``qualimap_html_fpath``,
        ``qualimap_cov_hist_fpath``, ``qualimap_gc_hist_fpath`` and
        ``qualimap_ins_size_hist_fpath``.
    :return: tuple ``(depth_stats, reads_stats, indels_stats, target_stats)``
        of dicts. A metric absent from the parsed report is stored as None.
    """
    if not verify_file(sample.qualimap_html_fpath):
        critical('QualiMap report was not found')

    qualimap_value_by_metric = report_parser.parse_qualimap_sample_report(sample.qualimap_html_fpath)
    bases_by_depth, median_depth = parse_qualimap_coverage_hist(sample.qualimap_cov_hist_fpath)
    median_gc, median_human_gc = parse_qualimap_gc_content(sample.qualimap_gc_hist_fpath)
    median_ins_size = parse_qualimap_insert_size(sample.qualimap_ins_size_hist_fpath)

    def find_rec(name, percent=False, on_target=True):
        """Look up a metric, preferring its '(on target)' variant when asked."""
        if on_target:
            name_on_target = name + ' (on target)'
            if percent:
                name_on_target += ' %'
            res = qualimap_value_by_metric.get(name_on_target)
            # Compare with None rather than truthiness: a falsy on-target
            # value (e.g. 0) is a real value and must not be replaced by the
            # whole-genome metric below.
            if res is not None:
                return res
        if percent:
            name += ' %'
        return qualimap_value_by_metric.get(name)

    depth_stats = dict(
        ave_depth=find_rec('Coverage Mean'),
        stddev_depth=find_rec('Coverage Standard Deviation'),
        median_depth=median_depth,
        bases_by_depth=bases_by_depth,
    )
    target_stats = dict(
        reference_size=find_rec('Reference size'),
        target_size=find_rec('Regions size/percentage of reference'),
        target_fraction=find_rec('Regions size/percentage of reference', percent=True),
    )
    reads_stats = dict(
        total=find_rec('Number of reads'),
        mapped=find_rec('Mapped reads', on_target=False),
        mapped_rate=find_rec('Mapped reads', percent=True, on_target=False),
        unmapped=find_rec('Unmapped reads'),
        unmapped_rate=find_rec('Unmapped reads', percent=True),
        # These names already end in '(on target)'; the doubled-suffix lookup
        # is tried first and the plain name is the fallback.
        mapped_on_target=find_rec('Mapped reads (on target)'),
        mapped_rate_on_target=find_rec('Mapped reads (on target)', percent=True),
        mapped_paired=find_rec('Mapped paired reads', on_target=False),
        mapped_paired_rate=find_rec('Mapped paired reads', percent=True, on_target=False),
        paired=find_rec('Paired reads'),
        paired_rate=find_rec('Paired reads', percent=True),
        dup=find_rec('Duplicated reads (flagged)'),
        dup_rate=find_rec('Duplicated reads (flagged)', percent=True),
        min_len=find_rec('Read min length'),
        max_len=find_rec('Read max length'),
        ave_len=find_rec('Read mean length'),
        median_gc=median_gc,
        median_human_gc=median_human_gc,
        median_ins_size=median_ins_size,
    )
    indels_stats = dict(
        mean_mq=find_rec('Mean Mapping Quality'),
        mismatches=find_rec('Mismatches'),
        insertions=find_rec('Insertions'),
        deletions=find_rec('Deletions'),
        homo_indels=find_rec('Homopolymer indels'),
    )
    return depth_stats, reads_stats, indels_stats, target_stats