def launch_bedcoverage_hist(work_dir, bed, bam, chr_lengths_fpath,
                            bedcov_output_fpath=None, bedtools='bedtools'):
    if not bedcov_output_fpath:
        bedcov_output_fpath = join(
            work_dir,
            splitext_plus(basename(bed))[0] + '__' +
            splitext_plus(basename(bam))[0] + '_bedcov_output.txt')

    if bam.endswith('bam'):
        bam = bam_to_bed_nocnf(bam, bedtools)
    verify_file(bam, is_critical=True, description='BAM to BED conversion result')

    v = bedtools_version(bedtools)
    if v and v >= 24:
        cmdline = '{bedtools} coverage -sorted -g {chr_lengths_fpath} -a {bed} -b {bam} -hist'.format(**locals())
    else:
        cmdline = '{bedtools} coverage -a {bam} -b {bed} -hist'.format(**locals())
    cmdline += ' > ' + bedcov_output_fpath
    info(cmdline)
    os.system(cmdline)

    res = verify_file(bedcov_output_fpath)
    if res:
        info('Done, saved to ' + bedcov_output_fpath)
    else:
        err('Error, result is non-existent or empty')
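
# Every snippet in this listing calls splitext_plus(), which is not defined here.
# Below is a minimal, self-contained sketch of what it is assumed to do (split off
# compound extensions such as '.fastq.gz' or '.bed.gz' in one go); the real helper
# in the source tree may behave differently.
import os.path


def splitext_plus_sketch(fpath):
    """Like os.path.splitext(), but also peels off a trailing compression suffix.

    splitext_plus_sketch('sample1_R1.fastq.gz') -> ('sample1_R1', '.fastq.gz')
    """
    base, ext = os.path.splitext(fpath)
    if ext in ('.gz', '.bz2', '.zip'):
        base, inner_ext = os.path.splitext(base)
        ext = inner_ext + ext
    return base, ext
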
def set_up_out_dirs(self, fastq_dirpath, fastqc_dirpath, downsample_targqc_dirpath):
    self.fastq_dirpath = fastq_dirpath
    self.fastqc_dirpath = fastqc_dirpath
    self.downsample_targqc_dirpath = downsample_targqc_dirpath

    self.l_fpath = join(fastq_dirpath, self.name + '_R1.fastq.gz')
    self.r_fpath = join(fastq_dirpath, self.name + '_R2.fastq.gz')

    self.sample_fastqc_dirpath = join(fastqc_dirpath, self.name + '.fq_fastqc')
    self.fastqc_html_fpath = join(fastqc_dirpath, self.name + '.fq_fastqc.html')
    self.l_fastqc_base_name = splitext_plus(basename(self.l_fpath))[0]
    self.r_fastqc_base_name = splitext_plus(basename(self.r_fpath))[0]
    # self.l_fastqc_html_fpath = None  # join(ds.fastqc_dirpath, + '_fastqc.html')
    # self.r_fastqc_html_fpath = None  # join(ds.fastqc_dirpath, splitext_plus(self.r_fpath)[0] + '_fastqc.html')
    if not isfile(self.fastqc_html_fpath):
        self.fastqc_html_fpath = join(self.sample_fastqc_dirpath, 'fastqc_report.html')

    self.targqc_sample = TargQC_Sample(self.name, join(downsample_targqc_dirpath, self.name))
    self.targetcov_html_fpath = self.targqc_sample.targetcov_html_fpath
    self.ngscat_html_fpath = self.targqc_sample.ngscat_html_fpath
    self.qualimap_html_fpath = self.targqc_sample.qualimap_html_fpath
def intersect_bed(cnf, bed1, bed2):
    bed1_fname, _ = splitext_plus(basename(bed1))
    bed2_fname, _ = splitext_plus(basename(bed2))
    output_fpath = join(cnf['work_dir'], bed1_fname + '__' + bed2_fname + '.bed')
    bedtools = get_system_path(cnf, 'bedtools')
    cmdline = '{bedtools} intersect -u -a {bed1} -b {bed2}'.format(**locals())
    call(cnf, cmdline, output_fpath, verify_output_not_empty=False)
    return output_fpath
def _intersect_with_tricky_regions(cnf, selected_bed_fpath, sample):
    info()
    info('Detecting problematic regions for ' + sample)

    bed_filenames = [fn + '.bed.gz' for fn in tricky_regions_fnames_d.keys()]
    merged_bed_fpaths = [join(cnf.genome.tricky_regions, 'merged', bed_filename)
                         for bed_filename in bed_filenames]

    info('Intersecting BED ' + selected_bed_fpath + ' with BED files of tricky regions')
    intersection_fpath = join(
        cnf.work_dir,
        splitext_plus(basename(selected_bed_fpath))[0] + '_tricky_vcf_bed.intersect')
    if not cnf.reuse_intermediate or not verify_file(intersection_fpath, silent=True, is_critical=False):
        bedtools = get_system_path(cnf, 'bedtools')
        cmdline = bedtools + ' intersect -header -a ' + selected_bed_fpath + \
                  ' -b ' + ' '.join(merged_bed_fpaths) + ' -wo -filenames'
        call(cnf, cmdline, output_fpath=intersection_fpath, exit_on_error=False)
    return intersection_fpath
def markdup_bam(cnf, in_bam_fpath, bammarkduplicates=None):
    """Perform non-stream based deduplication of BAM input files using biobambam.
    """
    if not bammarkduplicates:
        bammarkduplicates = get_system_path(cnf, 'bammarkduplicates')
        if not bammarkduplicates:
            warn('No biobambam bammarkduplicates, can\'t mark duplicates.')
            return None

    out_bam_fpath = add_suffix(in_bam_fpath, 'markdup')
    tmp_fpath = join(cnf.work_dir, splitext_plus(basename(in_bam_fpath))[0] + '_markdup')
    safe_mkdir(dirname(tmp_fpath))
    cmdline = '{bammarkduplicates} tmpfile={tmp_fpath} I={in_bam_fpath} O={out_bam_fpath}'.format(**locals())
    res = call(cnf, cmdline, output_fpath=out_bam_fpath,
               stdout_to_outputfile=False, exit_on_error=False)
    if res:
        return out_bam_fpath
    else:
        return None
def _tracks(cnf, track_fpath, input_fpath):
    if not verify_file(track_fpath):
        return None

    field_name = splitext_plus(basename(track_fpath))[0]

    step_greetings('Intersecting with ' + field_name)

    output_fpath = intermediate_fname(cnf, input_fpath, field_name)
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    toolpath = get_system_path(cnf, 'vcfannotate')
    if not toolpath:
        err('WARNING: Skipping annotation with tracks: vcfannotate '
            'executable not found, you probably need to specify the path in system_config, or '
            'run load bcbio: ". /group/ngs/bin/bcbio-prod.sh"')
        return None

    # self.all_fields.append(field_name)

    cmdline = '{toolpath} -b {track_fpath} -k {field_name} {input_fpath}'.format(**locals())

    assert input_fpath
    output_fpath = call_subprocess(cnf, cmdline, input_fpath, output_fpath,
                                   stdout_to_outputfile=True, overwrite=True)
    if not verify_vcf(output_fpath):
        err('Error: annotating with track ' + track_fpath + ' did not produce a usable VCF: ' + str(output_fpath))
        return output_fpath

    # Set TRUE or FALSE for tracks
    def proc_line(line, i):
        if field_name in line:
            if not line.startswith('#'):
                fields = line.split('\t')
                info_line = fields[7]
                info_pairs = [attr.split('=') for attr in info_line.split(';')]
                info_pairs = [[pair[0], ('TRUE' if pair[1] else 'FALSE')]
                              if pair[0] == field_name and len(pair) > 1 else pair
                              for pair in info_pairs]
                info_line = ';'.join('='.join(pair) if len(pair) == 2 else pair[0]
                                     for pair in info_pairs)
                fields = fields[:7] + [info_line] + fields[8:]
                return '\t'.join(fields)
        return line

    assert output_fpath
    output_fpath = iterate_file(cnf, output_fpath, proc_line, suffix='trk')
    return verify_vcf(output_fpath, is_critical=True)
def sambamba_depth(cnf, bed, bam, output_fpath=None, use_grid=False,
                   depth_thresholds=None, sample_name=None, only_depth=False, silent=False):
    sample_name = sample_name or splitext_plus(basename(bam))[0]
    if not output_fpath:
        output_fpath = join(
            cnf.work_dir,
            splitext_plus(basename(bed))[0] + '_' + sample_name + '_sambamba_depth.txt')

    if cnf.reuse_intermediate and verify_file(output_fpath, silent=True):
        info(output_fpath + ' exists, reusing.')
        if use_grid:
            return None
        else:
            return output_fpath

    sambamba = get_system_path(cnf, join(get_ext_tools_dirname(), 'sambamba'), is_critical=True)

    thresholds_str = ''
    if not only_depth:
        depth_thresholds = depth_thresholds or cnf.coverage_reports.depth_thresholds
        thresholds_str = '-T ' + ' -T'.join([str(d) for d in depth_thresholds])

    cmdline = ('depth region -F "not duplicate and not failed_quality_control" '
               '-L {bed} {thresholds_str} {bam}').format(**locals())
    return call_sambamba(cnf, cmdline, output_fpath=output_fpath, bam_fpath=bam,
                         sambamba=sambamba, use_grid=use_grid,
                         command_name='depth_' + splitext_plus(basename(bed))[0],
                         sample_name=sample_name, silent=silent)
def create_jbrowse_symlink(genome, project_name, sample, file_fpath):
    jbrowse_data_path, _, _ = set_folders(genome)
    jbrowse_dirpath = join(jbrowse_data_path, 'tracks')
    jbrowse_project_dirpath = join(jbrowse_dirpath, project_name)

    base, ext = splitext_plus(file_fpath)
    if ext in ['.tbi', '.bai']:
        base, ext2 = splitext_plus(base)
        ext = ext2 + ext
    sym_link = join(jbrowse_project_dirpath, sample + ext)

    if not verify_dir(jbrowse_project_dirpath):
        safe_mkdir(jbrowse_project_dirpath)
    if isfile(file_fpath) and not isfile(sym_link):
        try:
            os.symlink(file_fpath, sym_link)
        except OSError:
            warn(traceback.format_exc())
    if isfile(sym_link):
        change_permissions(sym_link)
    return sym_link
def bam_to_bed_nocnf(bam_fpath, bedtools='bedtools', gzip='gzip'):
    info('Converting the BAM to BED to save some memory.')  # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + '.bed.gz'
    cmdline = '{bedtools} bamtobed -i {bam_fpath} | {gzip} > {bam_bed_fpath}'.format(**locals())
    info(cmdline)
    os.system(cmdline)
    bam_bed_fpath = verify_file(bam_bed_fpath)
    if bam_bed_fpath:
        info('Done, saved to ' + bam_bed_fpath)
    else:
        err('Error, result is non-existent or empty')
    return bam_bed_fpath
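
# A subprocess-based sketch of the same 'bedtools bamtobed | gzip' pipeline that
# bam_to_bed_nocnf() builds for os.system(), avoiding shell string interpolation.
# 'example.bam' and the bedtools/gzip executables on PATH are assumptions for
# illustration only, not taken from this listing.
import os
import subprocess


def bam_to_bed_gz_sketch(bam_fpath, bedtools='bedtools', gzip='gzip'):
    bed_gz_fpath = os.path.splitext(bam_fpath)[0] + '.bed.gz'
    with open(bed_gz_fpath, 'wb') as out:
        bamtobed = subprocess.Popen([bedtools, 'bamtobed', '-i', bam_fpath], stdout=subprocess.PIPE)
        gzipping = subprocess.Popen([gzip, '-c'], stdin=bamtobed.stdout, stdout=out)
        bamtobed.stdout.close()  # let bamtobed receive SIGPIPE if gzip exits early
        gzipping.communicate()
        bamtobed.wait()
    return bed_gz_fpath


# bam_to_bed_gz_sketch('example.bam')  # -> 'example.bed.gz'
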
def bam_to_bed(cnf, bam_fpath, to_gzip=True):
    info('Converting the BAM to BED to save some memory.')  # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + ('.bed.gz' if to_gzip else '.bed')
    bedtools = get_system_path(cnf, 'bedtools')
    gzip = get_system_path(cnf, 'gzip')
    cmdline = '{bedtools} bamtobed -i {bam_fpath}'.format(**locals())
    cmdline += ' | {gzip}'.format(**locals()) if to_gzip else ''
    call(cnf, cmdline, output_fpath=bam_bed_fpath, verify_output_not_empty=False)
    return bam_bed_fpath
def markdup_sam(cnf, in_sam_fpath, samblaster=None):
    """Perform non-stream based deduplication of SAM input files using samblaster.
    """
    if not samblaster:
        samblaster = get_system_path(cnf, 'samblaster')
        if not samblaster:
            warn('No samblaster, can\'t mark duplicates.')
            return None

    out_sam_fpath = add_suffix(in_sam_fpath, 'markdup')
    tmp_fpath = join(cnf.work_dir, splitext_plus(basename(in_sam_fpath))[0] + '_markdup')
    safe_mkdir(dirname(tmp_fpath))
    cmdline = '{samblaster} -i {in_sam_fpath} -o {out_sam_fpath}'.format(**locals())
    res = call(cnf, cmdline, output_fpath=out_sam_fpath,
               stdout_to_outputfile=False, exit_on_error=False)
    if res:
        return out_sam_fpath
    else:
        return None
def read_samples_info_and_split(common_cnf, options, inputs):
    # TODO: _set_up_dirs(cnf) for each sample

    info('')
    info('Processing input details...')

    details = None
    for key in inputs:
        if options.get(key):
            common_cnf[key] = adjust_path(options[key])
            info('Using ' + common_cnf[key])
            details = [common_cnf]
    if not details:
        details = common_cnf.get('details')
    if not details:
        critical('Please provide input ' + ', '.join(inputs) +
                 ' in the command line or in the run info yaml config.')

    all_samples = OrderedDict()

    for one_item_cnf in details:
        if 'vcf' not in one_item_cnf:
            critical('ERROR: A section in details does not contain field "vcf".')
        one_item_cnf['vcf'] = adjust_path(one_item_cnf['vcf'])
        verify_file(one_item_cnf['vcf'], 'Input file', is_critical=True)

        join_parent_conf(one_item_cnf, common_cnf)

        work_vcf = join(one_item_cnf['work_dir'], basename(one_item_cnf['vcf']))
        check_file_changed(one_item_cnf, one_item_cnf['vcf'], work_vcf)
        if not one_item_cnf.get('reuse_intermediate'):
            with open_gzipsafe(one_item_cnf['vcf']) as inp, open_gzipsafe(work_vcf, 'w') as out:
                out.write(inp.read())
        one_item_cnf['vcf'] = work_vcf

        vcf_header_samples = read_sample_names_from_vcf(one_item_cnf['vcf'])

        # MULTIPLE SAMPLES
        if ('samples' in one_item_cnf or one_item_cnf.get('split_samples')) and len(vcf_header_samples) == 0:
            sample_cnfs = _verify_sample_info(one_item_cnf, vcf_header_samples)
            for header_sample_name in vcf_header_samples:
                if header_sample_name not in sample_cnfs:
                    sample_cnfs[header_sample_name] = one_item_cnf.copy()
                if header_sample_name in all_samples:
                    critical('ERROR: duplicated sample name: ' + header_sample_name)

                cnf = all_samples[header_sample_name] = sample_cnfs[header_sample_name]
                cnf['name'] = header_sample_name
                if cnf.get('keep_intermediate'):
                    cnf['log'] = join(cnf['work_dir'], cnf['name'] + '.log')

                # cnf['vcf'] = extract_sample(cnf, one_item_cnf['vcf'], cnf['name'])
                info()

        # SINGLE SAMPLE
        else:
            cnf = one_item_cnf

            if 'bam' in cnf:
                cnf['bam'] = adjust_path(cnf['bam'])
                verify_bam(cnf['bam'], is_critical=True)

            cnf['name'] = splitext_plus(basename(cnf['vcf']))[0]

            if cnf.get('keep_intermediate'):
                cnf['log'] = join(cnf['work_dir'], cnf['name'] + '.log')

            cnf['vcf'] = work_vcf
            all_samples[cnf['name']] = cnf

    if not all_samples:
        info('No samples.')
    else:
        info('Using samples: ' + ', '.join(all_samples) + '.')

    return all_samples
def _extract_fields(cnf, vcf_fpath, samplename, main_sample_index=0):
    fname, _ = splitext_plus(basename(vcf_fpath))
    tsv_fpath = join(cnf.work_dir, fname + '.tsv')

    if cnf.get('reuse_intermediate'):
        if file_exists(tsv_fpath):
            info(tsv_fpath + ' exists, reusing')
            return tsv_fpath

    manual_tsv_fields = cnf.annotation['tsv_fields']
    if not manual_tsv_fields:
        return None

    all_fields = []
    basic_fields = []
    info_fields = []
    eff_fields = []
    gt_fields = []
    tumor_gt = 'GEN[' + str(main_sample_index) + '].'
    normal_gt = 'GEN[' + str(1 - main_sample_index) + '].'

    lines = []

    with open(vcf_fpath) as inp:
        reader = vcf.Reader(inp)

        info('TSV saver: Building field list')
        for f in [rec.keys()[0] for rec in manual_tsv_fields]:
            if f.startswith('GEN'):
                _f = f.split('.')[1]
                if len(reader.samples) > 0:
                    if _f in reader.formats:
                        gt_fields.append(_f)
                        all_fields.append(f.replace('GEN[*].', tumor_gt))
                        if len(reader.samples) > 1:
                            all_fields.append(f.replace('GEN[*].', normal_gt))
                    else:
                        warn('TSV Saver: Warning: ' + f + ' is not in VCF header FORMAT records')

            elif f in ['CHROM', 'POS', 'REF', 'ALT', 'ID', 'FILTER', 'QUAL']:
                all_fields.append(f)
                basic_fields.append(f)

            elif any(f.startswith(af) and af in reader.infos for af in ['EFF', 'ANN']):
                all_fields.append(f)
                eff_fields.append(f)

            else:
                if f in reader.infos:
                    info_fields.append(f)
                    all_fields.append(f)
                elif f == 'SAMPLE':
                    all_fields.append(f)
                else:
                    warn('TSV Saver: Warning: ' + f + ' is not in VCF header INFO records')

        info('TSV saver: Iterating over records...')
        d = OrderedDict()
        for rec in reader:
            for f in basic_fields:
                d[f] = rec.__dict__[f]
            for f in info_fields:
                d[f] = rec.INFO[f] if f in rec.INFO else ''
            if 'SAMPLE' not in d:
                d['SAMPLE'] = samplename

            if eff_fields:
                eff = rec.INFO.get(eff_fields[0][:3])
                if not eff:
                    for f in eff_fields:
                        d[f] = ''
                else:
                    eff_fs = eff[0].split('|')
                    eff_d = dict()
                    for val, header in zip(eff_fs, ['ALLELE', 'EFFECT', 'IMPACT', 'GENE', 'GENEID',
                                                    'FEATURE', 'FEATUREID', 'BIOTYPE', 'RANK', 'HGVS_C',
                                                    'HGVS_P', 'CDNA_POSLEN', 'CDS_POSLEN', 'AA_POSLEN',
                                                    'DISTANCE', 'LOG']):
                        if 'POSLEN' in header:
                            eff_d[header.split('_')[0] + '_POS'] = val.split('/')[0] if val else ''
                            eff_d[header.split('_')[0] + '_LEN'] = val.split('/')[1] if val else ''
                        else:
                            eff_d[header] = val
                    # ANN=GA|3_prime_UTR_variant|MODIFIER|RPL22|RPL22|transcript|NM_000983.3|Coding|4/4|c.*173dupT|||||173|;
                    # Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID |
                    # Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length |
                    # AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO
                    for f in eff_fields:
                        d[f] = eff_d[f.split('.')[1]]

            if rec.FORMAT:
                for _f in gt_fields:
                    if _f in rec.FORMAT:
                        d[tumor_gt + _f] = rec.samples[main_sample_index][_f]
                        if len(rec.samples) > 1 - main_sample_index:
                            d[normal_gt + _f] = rec.samples[1 - main_sample_index][_f]
                        else:
                            d[normal_gt + _f] = ''
                    else:
                        d[tumor_gt + _f] = ''
                        d[normal_gt + _f] = ''

            fs = []
            for f in all_fields:
                v = d[f]
                fs.append(v if v != '.' else '')
            lines.append(fs)

    info('TSV saver: Adding GEN[*] fields both for sample and for matched normal...')
    field_map = dict()
    for rec in manual_tsv_fields:
        k = rec.keys()[0]
        v = rec.values()[0]
        if k.startswith('GEN[*].'):
            _f = k.split('.')[1]
            field_map[tumor_gt + _f] = v
            field_map[normal_gt + _f] = 'Matched_' + v
        else:
            field_map[k] = v

    info('TSV saver: Writing TSV to ' + tsv_fpath)
    with file_transaction(cnf.work_dir, tsv_fpath) as tx:
        with open(tx, 'w') as out:
            out.write('\t'.join(field_map[f] for f in all_fields) + '\n')
            for fs in lines:
                new_fs = []
                for f in fs:
                    if isinstance(f, list):
                        new_fs.append(','.join(map(str, f)))
                    elif f is None:
                        new_fs.append('')
                    else:
                        new_fs.append(str(f))
                out.write('\t'.join(new_fs) + '\n')

    info('TSV saver: saved ' + tsv_fpath)
    return tsv_fpath
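
# Self-contained sketch of the SnpEff ANN/EFF parsing step inside _extract_fields():
# one ANN entry is split on '|' and zipped onto the same header list used above, with
# the '<x>.pos / <x>.length' columns expanded into separate *_POS / *_LEN keys.
# The example entry comes from the comment in _extract_fields(); the function name
# parse_ann_entry is made up for this sketch.
ANN_HEADERS = ['ALLELE', 'EFFECT', 'IMPACT', 'GENE', 'GENEID', 'FEATURE', 'FEATUREID',
               'BIOTYPE', 'RANK', 'HGVS_C', 'HGVS_P', 'CDNA_POSLEN', 'CDS_POSLEN',
               'AA_POSLEN', 'DISTANCE', 'LOG']


def parse_ann_entry(ann_entry):
    eff_d = dict()
    for val, header in zip(ann_entry.split('|'), ANN_HEADERS):
        if 'POSLEN' in header:
            eff_d[header.split('_')[0] + '_POS'] = val.split('/')[0] if val else ''
            eff_d[header.split('_')[0] + '_LEN'] = val.split('/')[1] if val else ''
        else:
            eff_d[header] = val
    return eff_d


# parse_ann_entry('GA|3_prime_UTR_variant|MODIFIER|RPL22|RPL22|transcript|NM_000983.3|'
#                 'Coding|4/4|c.*173dupT|||||173|')['EFFECT']  # -> '3_prime_UTR_variant'
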
def read_samples(args, caller_name=None):
    vcf_fpath_by_sample = OrderedDict()
    bad_vcf_fpaths = []
    info('Reading samples...')

    if len(args) == 1:
        first_fpath = args[0]
        if not first_fpath.endswith('.vcf') and not first_fpath.endswith('.vcf.gz'):
            # TODO: check ##fileformat=VCF ?
            info('First argument file name does not look like a VCF, assuming a TSV with sample names and file paths')
            with open(first_fpath) as f:
                for i, l in enumerate(f):
                    fs = l.strip().split('\t')
                    if len(fs) != 2:
                        critical('Line ' + str(i) + ' has only ' + str(len(fs)) +
                                 ' fields. Expecting 2 (sample and vcf_fpath)')
                    sn, vcf_fpath = fs
                    if not verify_file(vcf_fpath):
                        bad_vcf_fpaths.append(vcf_fpath)
                    vcf_fpath_by_sample[sn] = adjust_path(vcf_fpath)

            if bad_vcf_fpaths:
                critical('VCF files cannot be found, are empty, or are not VCFs: ' + ', '.join(bad_vcf_fpaths))
            info('Done reading ' + str(len(vcf_fpath_by_sample)) + ' samples')
            return vcf_fpath_by_sample

    for arg in args or [os.getcwd()]:
        vcf_fpath = verify_vcf(arg.split(',')[0])
        if not verify_file(vcf_fpath):
            bad_vcf_fpaths.append(vcf_fpath)
        if len(arg.split(',')) > 1:
            sn = arg.split(',')[1]
        else:
            sn = basename(splitext_plus(vcf_fpath)[0])
            if caller_name and sn.endswith('-' + caller_name):
                sn = sn[:-len(caller_name) - 1]
        info('  ' + sn)
        if sn in vcf_fpath_by_sample:
            if vcf_fpath_by_sample[sn] != vcf_fpath:
                warn('Duplicated record ' + sn + ', VCF file is different (existing: ' +
                     vcf_fpath_by_sample[sn] + ', new: ' + vcf_fpath + ')')
            else:
                warn('Duplicated record ' + sn + ', VCF file is the same: ' + vcf_fpath)
        else:
            vcf_fpath_by_sample[sn] = vcf_fpath

    if bad_vcf_fpaths:
        critical('VCF files cannot be found, are empty, or are not VCFs: ' + ', '.join(bad_vcf_fpaths))
    info('Done reading ' + str(len(vcf_fpath_by_sample)) + ' samples')

    # TODO: read sample names from VCF
    # def get_main_sample(self, main_sample_index=None):
    #     if len(self._sample_indexes) == 0:
    #         return None
    #     if main_sample_index is not None:
    #         return self.samples[main_sample_index]
    #     try:
    #         sample_index = [sname.lower() for sname in self._sample_indexes] \
    #             .index(self.sample_name_from_file.lower())
    #     except ValueError:
    #         return self.samples[0]
    #     else:
    #         return self.samples[sample_index]

    return vcf_fpath_by_sample
def make_fastqc_reports(cnf, fastq_fpaths, output_dir):
    # if isdir(fastqc_dirpath):
    #     if isdir(fastqc_dirpath + '.bak'):
    #         try:
    #             shutil.rmtree(fastqc_dirpath + '.bak')
    #         except OSError:
    #             pass
    #     if not isdir(fastqc_dirpath + '.bak'):
    #         os.rename(fastqc_dirpath, fastqc_dirpath + '.bak')
    # if isdir(fastqc_dirpath):
    #     err('Could not run and combine fastqc because it already exists and could not be moved to fastqc.bak')
    #     return None

    fastqc = get_system_path(cnf, 'fastqc')
    if not fastqc:
        err('FastQC is not found, cannot make reports')
        return None
    else:
        safe_mkdir(output_dir)

    fqc_samples = []
    fastqc_jobs = []
    for fastq_fpath in fastq_fpaths:
        s = FQC_Sample(name=splitext_plus(basename(fastq_fpath))[0], fastq_fpath=fastq_fpath)
        fqc_samples.append(s)
        info('Added sample ' + s.name)

    for fqc_s in fqc_samples:
        if cnf.reuse_intermediate and verify_file(fqc_s.fastqc_html_fpath, silent=True):
            info(fqc_s.fastqc_html_fpath + ' exists, reusing')
        else:
            fastqc_jobs.append(run_fastqc(cnf, fqc_s.fastq_fpath, fqc_s.name, output_dir))
    info()

    wait_for_jobs(cnf, fastqc_jobs)
    fastqc_jobs = []
    # while True:
    for fqc_s in fqc_samples:
        fqc_s.fastqc_html_fpath = find_fastqc_html(output_dir, fqc_s.name)
    not_done_fqc = [fqc_s for fqc_s in fqc_samples
                    if not verify_file(fqc_s.fastqc_html_fpath,
                                       description='Not found FastQC html for ' + fqc_s.name)]
    # if not not_done_fqc:
    #     info('')
    #     info('Every FastQC job is done, moving on.')
    #     info('-' * 70)
    #     break
    # else:
    #     info('')
    #     info('Some FastQC jobs are not done (' + ', '.join(f.name for f in not_done_fqc) + '). Retrying them.')
    #     info('')
    #     for fqc_s in not_done_fqc:
    #         fastqc_jobs.append(run_fastqc(cnf, fqc_s.fastq_fpath, fqc_s.name, output_dir))
    #     wait_for_jobs(cnf, fastqc_jobs)

    for fqc_s in fqc_samples:
        sample_fastqc_dirpath = join(output_dir, fqc_s.name + '_fastqc')
        if isfile(sample_fastqc_dirpath + '.zip'):
            try:
                os.remove(sample_fastqc_dirpath + '.zip')
            except OSError:
                pass

    comb_fastqc_fpath = join(output_dir, 'fastqc.html')
    write_fastqc_combo_report(cnf, comb_fastqc_fpath, fqc_samples)
    verify_file(comb_fastqc_fpath, is_critical=True)
    info('Combined FastQC saved to ' + comb_fastqc_fpath)
    return comb_fastqc_fpath