def open_gzipsafe(f, mode='r'):
    """Open a file, transparently handling gzip: gzipped paths are opened in text
    mode, and files that turn out not to be valid archives fall back to plain open()."""
    # mode_t = mode.replace('b', '')
    # mode_b = mode if 'b' in mode else mode + 'b'
    if f.endswith('.gz') or f.endswith('.gzip') or f.endswith('.gz.tx') or f.endswith('.gzip.tx'):
        try:
            h = gzip.open(f, mode=mode + 't')
        except IOError as e:
            err('Error opening gzip ' + f + ': ' + str(e) + ', opening as plain text')
            return open(f, mode=mode)
        else:
            if 'w' in mode:
                return h
            else:
                # Read one byte to make sure the archive is valid, then reopen
                # so the caller gets a fresh handle positioned at the start.
                try:
                    h.read(1)
                except IOError as e:
                    err('Error opening gzip ' + f + ': ' + str(e) + ', opening as plain text')
                    h.close()
                    return open(f, mode=mode)
                else:
                    h.close()
                    h = gzip.open(f, mode=mode + 't')
                    return h
    else:
        return open(f, mode=mode)
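# Usage sketch (not part of the original module): open_gzipsafe lets callers
# iterate gzipped and plain-text files the same way. The helper name and the
# record filtering below are hypothetical.
def _example_count_records(fpath):
    # Works the same for e.g. 'variants.vcf' and 'variants.vcf.gz'
    with open_gzipsafe(fpath) as fh:
        return sum(1 for line in fh if line.strip() and not line.startswith('#'))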
def bgzip_and_tabix(fpath, reuse=False, tabix_parameters='', **kwargs):
    gzipped_fpath = fpath + '.gz'
    tbi_fpath = gzipped_fpath + '.tbi'

    if reuse and \
            file_exists(gzipped_fpath) and \
            (getctime(gzipped_fpath) >= getctime(fpath) if file_exists(fpath) else True) and \
            file_exists(tbi_fpath) and getctime(tbi_fpath) >= getctime(gzipped_fpath):
        info('Compressed file and index already exist, reusing')
        return gzipped_fpath

    info('Compressing and tabixing file, writing ' + gzipped_fpath + '(.tbi)')
    bgzip = which('bgzip')
    tabix = which('tabix')
    if not bgzip:
        err('Cannot index file because bgzip is not found')
    if not tabix:
        err('Cannot index file because tabix is not found')
    if not bgzip or not tabix:  # both tools are needed to produce the .gz and .tbi
        return fpath

    if isfile(gzipped_fpath):
        os.remove(gzipped_fpath)
    if isfile(tbi_fpath):
        os.remove(tbi_fpath)

    info('BGzipping ' + fpath)
    cmdline = '{bgzip} {fpath}'.format(**locals())
    call_process.run(cmdline)

    info('Tabixing ' + gzipped_fpath)
    cmdline = '{tabix} {tabix_parameters} {gzipped_fpath}'.format(**locals())
    call_process.run(cmdline)

    return gzipped_fpath
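# Usage sketch (not part of the original module): a typical call compresses a
# coordinate-sorted VCF and indexes it for random access. tabix_parameters is
# passed verbatim to the tabix command line; '-p vcf' and the helper name are
# assumptions for illustration.
def _example_index_vcf(vcf_fpath):
    # Produces vcf_fpath + '.gz' and '.gz.tbi'; reuse=True skips the work if
    # both already exist and are newer than the source file.
    return bgzip_and_tabix(vcf_fpath, reuse=True, tabix_parameters='-p vcf')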
def file_exists_check(output_fpath=None, input_fpath=None):
    if output_fpath is None:
        return True
    ok = os.path.exists(output_fpath)
    if not ok:
        err("Did not find output file {0}".format(output_fpath))
    return ok
def file_nonempty_check(output_fpath=None, input_fpath=None):
    if output_fpath is None:
        return True
    # Besides existence, require the output file to actually have content
    ok = file_exists_check(output_fpath) and os.path.getsize(output_fpath) > 0
    if not ok:
        err("Did not find non-empty output file {0}".format(output_fpath))
    return ok
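# Usage sketch (not part of the original module): both checks share the
# (output_fpath, input_fpath) signature, so they can be applied together as
# post-run validators for an external command. The helper name is hypothetical.
def _example_validate_output(output_fpath, input_fpath=None):
    return all(check(output_fpath, input_fpath)
               for check in (file_exists_check, file_nonempty_check))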
def count_bed_cols(bed_fpath):
    with open(bed_fpath) as f:
        for l in f:
            if l and l.strip() and not l.startswith('#'):
                return len(l.split('\t'))
    # return len(next(dropwhile(lambda x: x.strip().startswith('#'), open(bed_fpath))).split('\t'))
    err('Empty bed file: ' + bed_fpath)
    return None
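# Usage sketch (not part of the original module): the column count is a quick
# way to tell whether a BED file carries annotations beyond the three required
# coordinate columns. The helper name is hypothetical.
def _example_bed_is_annotated(bed_fpath):
    ncols = count_bed_cols(bed_fpath)
    return ncols is not None and ncols >= 4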
def load_yaml_config(fpath):
    verify_file(fpath, is_critical=True)
    try:
        dic = load_yaml(open(fpath))
    except Exception:
        err(format_exc())
        critical('Could not parse bcbio YAML ' + fpath)
    else:
        return dic
def bam_to_bed_nocnf(bam_fpath, bedtools='bedtools', gzip='gzip'):
    info('Converting the BAM to BED to save some memory.')  # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + '.bed.gz'
    cmdline = '{bedtools} bamtobed -i {bam_fpath} | {gzip} > {bam_bed_fpath}'.format(**locals())
    info(cmdline)
    os.system(cmdline)
    bam_bed_fpath = verify_file(bam_bed_fpath)
    if bam_bed_fpath:
        info('Done, saved to ' + bam_bed_fpath)
    else:
        err('Error, result is non-existent or empty')
    return bam_bed_fpath
def find_fastq_pairs(fpaths):
    info('Finding FastQ pairs...')
    fastqs_by_sample_name = dict()
    for fpath in fpaths:
        fn, ext = splitext_plus(basename(fpath))
        if ext in ['.fq', '.fq.gz', '.fastq', '.fastq.gz']:
            sname, l_fpath, r_fpath = None, None, None
            if fn.endswith('_1'):
                sname = fn[:-2]
                l_fpath = fpath
            if fn.endswith('_R1'):
                sname = fn[:-3]
                l_fpath = fpath
            if fn.endswith('_2'):
                sname = fn[:-2]
                r_fpath = fpath
            if fn.endswith('_R2'):
                sname = fn[:-3]
                r_fpath = fpath

            if sname:
                # Strip the Illumina sample-sheet suffix (e.g. _S12) and normalise dashes
                m = re.match(r'(.*)_S\d+', sname)
                if m:
                    sname = m.group(1)
                sname = sname.replace('-', '_')
            else:
                sname = fn
                info('Cannot detect left/right read suffix for ' + fn)

            l, r = fastqs_by_sample_name.get(sname, (None, None))
            if l and l_fpath:
                critical('Duplicated left FastQ files for ' + sname + ': ' + l + ' and ' + l_fpath)
            if r and r_fpath:
                critical('Duplicated right FastQ files for ' + sname + ': ' + r + ' and ' + r_fpath)
            fastqs_by_sample_name[sname] = l or l_fpath, r or r_fpath

    fixed_fastqs_by_sample_name = dict()
    for sname, (l, r) in fastqs_by_sample_name.items():
        if not l:
            err('ERROR: for sample ' + sname + ', left reads not found')
        if not r:
            err('ERROR: for sample ' + sname + ', right reads not found')
        if l and r:
            fixed_fastqs_by_sample_name[sname] = l, r

    return fixed_fastqs_by_sample_name
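# Usage sketch (not part of the original module): pairing relies on the
# _1/_2 and _R1/_R2 file name suffixes, with an optional Illumina _S<N> suffix
# stripped from the sample name. The glob pattern and helper name are hypothetical.
def _example_collect_pairs(fastq_dir):
    import glob
    fpaths = glob.glob(join(fastq_dir, '*.fastq.gz'))
    return find_fastq_pairs(fpaths)  # {sample_name: (left_fpath, right_fpath)}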
def run(self, fn, param_lists):
    if self.n_samples == 0:
        return []
    assert self.n_samples == len(param_lists)
    n_params = len(param_lists[0])
    for sample_i, params in enumerate(param_lists):
        if params is None:
            err('Parameter list for sample ' + str(sample_i) + ' is None')
        if len(params) != n_params:
            err('Parameter list for sample ' + str(sample_i) + ' has ' + str(len(params)) +
                ' items, which does not match the ' + str(n_params) + ' items for the first sample')
    # Transpose per-sample parameter lists into one iterable per argument position for map()
    res = self._view.view.map(fn, *([params[param_i] for params in param_lists]
                                    for param_i in range(n_params)))
    return res
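# Usage sketch (not part of the original module): param_lists holds one list of
# positional arguments per sample; run() transposes them so map() receives one
# iterable per argument position. parallel_view, process_sample and the sample
# attributes below are hypothetical.
def _example_run_per_sample(parallel_view, samples, work_dir):
    return parallel_view.run(process_sample,
                             [[s.name, s.bam, join(work_dir, s.name)] for s in samples])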
def file_reasonable_size(output_fpath, input_fpath):
    ok = file_exists_check(output_fpath)
    if not ok:
        return ok
    # named pipes -- we can't calculate size
    if input_fpath.strip().startswith("<("):
        return True
    if input_fpath.endswith((".bam", ".gz")):
        scale = 7.0
    else:
        scale = 10.0
    orig_size = os.path.getsize(input_fpath) / pow(1024.0, 3)
    out_size = os.path.getsize(output_fpath) / pow(1024.0, 3)
    if out_size < (orig_size / scale):
        err("Output file unexpectedly small. %.1fGb for output versus "
            "%.1fGb for the input file. This often indicates a truncated "
            "BAM file or memory errors during the run." % (out_size, orig_size))
        return False
    else:
        return True
def proc_fastq(samples, parall_view, work_dir, bwa_prefix, downsample_to, num_pairs_by_sample=None, dedup=True):
    num_pairs_by_sample = num_pairs_by_sample or dict()
    if downsample_to:
        # Read pairs counts
        debug()
        if all(s.name in num_pairs_by_sample for s in samples):
            debug('Using read pairs counts extracted from FastQC reports')
        elif all(can_reuse(make_pair_counts_fpath(join(work_dir, s.name)), s.l_fpath) for s in samples):
            debug('Reusing pairs counts, reading from files')
            num_pairs_by_sample = {s.name: int(open(make_pair_counts_fpath(join(work_dir, s.name))).read().strip())
                                   for s in samples}
        else:
            info('Counting read pairs')
            num_pairs = parall_view.run(count_read_pairs,
                [[s.name, safe_mkdir(join(work_dir, s.name)), s.l_fpath] for s in samples])
            num_pairs_by_sample = {s.name: pairs_count for s, pairs_count in zip(samples, num_pairs)}

        # Downsampling
        debug()
        if all(can_reuse(make_downsampled_fpath(join(work_dir, s.name), s.l_fpath), s.l_fpath) and
               can_reuse(make_downsampled_fpath(join(work_dir, s.name), s.r_fpath), s.r_fpath)
               for s in samples):
            debug('Reusing downsampled FastQ')
            for s in samples:
                s.l_fpath = make_downsampled_fpath(join(work_dir, s.name), s.l_fpath)
                s.r_fpath = make_downsampled_fpath(join(work_dir, s.name), s.r_fpath)
        else:
            if isinstance(downsample_to, float):
                info('Downsampling FastQ to ' + str(float(downsample_to)) + ' fraction of reads')
            else:
                info('Downsampling FastQ to ' + str(int(downsample_to)) + ' read pairs')
            fastq_pairs = parall_view.run(downsample,
                [[join(work_dir, s.name), s.name, s.l_fpath, s.r_fpath, downsample_to, num_pairs_by_sample.get(s.name)]
                 for s in samples])
            for s, (l_r, r_r) in zip(samples, fastq_pairs):
                s.l_fpath = l_r
                s.r_fpath = r_r
    else:
        info('Skipping downsampling')

    debug()
    if all(can_reuse(make_bam_fpath(join(work_dir, s.name)), [s.l_fpath, s.r_fpath]) for s in samples):
        debug('All downsampled BAMs exist, reusing')
        for s in samples:
            s.bam = make_bam_fpath(join(work_dir, s.name))
    else:
        # Check both tools are available before validating the bwa path: which() may return None
        bwa = which('bwa')
        smb = sambamba.get_executable()
        if not (bwa and smb):
            if not bwa:
                err('Error: bwa is required for the alignment pipeline')
            if not smb:
                err('Error: sambamba is required for the alignment pipeline')
            critical('Tools required for alignment not found')
        if not isfile(bwa):
            critical('BWA not found under ' + bwa)

        info('Aligning reads to the reference')
        bam_fpaths = parall_view.run(align,
            [[join(work_dir, s.name), s.name, s.l_fpath, s.r_fpath, bwa, smb, bwa_prefix, dedup, parall_view.cores_per_job]
             for s in samples])

        bam_fpaths = [verify_bam(b) for b in bam_fpaths]
        if len(bam_fpaths) < len(samples):
            critical('Some samples were not aligned successfully.')
        for bam, s in zip(bam_fpaths, samples):
            s.bam = bam

    return num_pairs_by_sample
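# Usage sketch (not part of the original module): proc_fastq mutates the sample
# objects in place (setting s.l_fpath/s.r_fpath to the downsampled FastQ and
# s.bam to the aligned BAM) and returns the read-pair counts. The view, sample
# objects and index prefix below are hypothetical.
def _example_align_samples(samples, parall_view, work_dir, bwa_index_prefix):
    # downsample_to: a float is treated as a fraction of reads, an int as an
    # absolute number of read pairs; a falsy value skips downsampling.
    return proc_fastq(samples, parall_view, work_dir, bwa_index_prefix,
                      downsample_to=0.1, dedup=True)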
def _save_best_details_for_each_gene(depth_threshs, samples, output_dir):
    metric_storage = get_detailed_metric_storage(depth_threshs)

    report = PerRegionSampleReport(sample='Best', metric_storage=metric_storage)
    report.add_record('Sample', 'contains best values from all samples: ' + ', '.join([s.name for s in samples]))

    total_regions = 0
    fpaths = [s.targqc_region_tsv for s in samples if verify_file(s.targqc_region_tsv)]
    if not fpaths:
        err('No targetcov detailed per-gene report was generated; skipping.')
        return None

    open_tsv_files = [open(fpath) for fpath in fpaths]

    # Skip the header lines; '#Sample' in the header means the first column holds the sample name
    first_col = 0
    while True:
        lines_for_each_sample = [next(f, None) for f in open_tsv_files]
        if not all(lines_for_each_sample):
            break
        l = lines_for_each_sample[0]
        if l.startswith('##'):
            continue
        elif l.startswith('#'):
            if l.startswith('#Sample'):
                first_col = 1
            break

    while True:
        lines_for_each_sample = [next(f, None) for f in open_tsv_files]
        if not all(lines_for_each_sample):
            break

        if all([not l.startswith('#') and ('Whole-Gene' in l or 'Gene-Exon' in l) for l in lines_for_each_sample]):
            shared_fields = lines_for_each_sample[0].split('\t')[first_col:first_col + 9]
            reg = report.add_row()
            reg.add_record('Chr', get_val(shared_fields[0]))
            reg.add_record('Start', get_int_val(shared_fields[1]))
            reg.add_record('End', get_int_val(shared_fields[2]))
            reg.add_record('Size', get_int_val(shared_fields[3]))
            reg.add_record('Gene', get_val(shared_fields[4]))
            reg.add_record('Strand', get_val(shared_fields[5]))
            reg.add_record('Feature', get_val(shared_fields[6]))
            reg.add_record('Biotype', get_val(shared_fields[7]))
            reg.add_record('Transcript', get_val(shared_fields[8]))

            min_depths, ave_depths, stddevs, withins = ([], [], [], [])
            percents_by_threshs = {t: [] for t in depth_threshs}

            for l in lines_for_each_sample:
                fs = l.split('\t')

                min_depths.append(get_int_val(fs[first_col + 9]))
                ave_depths.append(get_float_val(fs[first_col + 10]))
                stddevs.append(get_float_val(fs[first_col + 11]))
                withins.append(get_float_val(fs[first_col + 12]))
                for t, f in zip(depth_threshs, fs[first_col + 13:]):
                    percents_by_threshs[t].append(get_float_val(f))

            # counting bests
            reg.add_record('Min depth', select_best(min_depths))
            reg.add_record('Ave depth', select_best(ave_depths))
            reg.add_record('Std dev', select_best(stddevs, max))
            reg.add_record('W/n 20% of median depth', select_best(withins))
            for t in depth_threshs:
                reg.add_record('{}x'.format(t), select_best(percents_by_threshs[t]))

            total_regions += 1

    for f in open_tsv_files:
        f.close()

    gene_report_basename = add_suffix(samples[0].targqc_region_tsv, 'best')
    txt_rep_fpath = report.save_txt(join(output_dir, gene_report_basename + '.txt'))
    tsv_rep_fpath = report.save_tsv(join(output_dir, gene_report_basename + '.tsv'))
    info('')
    info('Best values for the regions (total ' + str(total_regions) + ') saved into:')
    info('  ' + txt_rep_fpath)
    return txt_rep_fpath