def extract_variant_from_bams(cnf, out_dirpath, transcripts, chr_length, samples, chrom, variant, bams_created_before):
    padding = 500
    sambamba = get_system_path(cnf, join(get_ext_tools_dirname(), 'sambamba'), is_critical=True)
    pos, ref, alt, variant_transcripts = variant['pos'], variant['ref'], variant['alt'], variant['transcripts']

    bam_prefix = None
    for transcript in variant_transcripts:
        transcript_name = sorted(variant_transcripts)[0]
        transcript_exons = transcripts[(transcript, chrom)]
        for idx, exon in enumerate(transcript_exons):
            if exon['start'] <= pos <= exon['stop']:
                start, end = exon['start'], exon['stop']
                bam_prefix = '{chrom}-{transcript_name}-{idx}-'.format(**locals())
        if bam_prefix:
            break
    if not bam_prefix:
        start, end = max(1, pos - padding), min(chr_length, pos + padding)
        ref_ = ref[:20]
        alt_ = alt[:20]
        bam_prefix = '{chrom}-{pos}-{ref_}-{alt_}-'.format(**locals())

    bams_by_sample = dict()
    for sample in samples:
        sample_name = sample.name.replace('-', '_')
        output_bam_fpath = join(out_dirpath, bam_prefix + '{sample_name}.bam'.format(**locals()))
        if output_bam_fpath in bams_created_before:
            continue
        if cnf.reuse_intermediate and verify_file(output_bam_fpath, silent=True):
            bams_by_sample[sample.name] = output_bam_fpath
        else:
            cmdline = '{sambamba} slice {sample.bam} {chrom}:{start}-{end} -o {output_bam_fpath}'.format(**locals())
            call(cnf, cmdline, silent=not cnf.verbose)
            if verify_file(output_bam_fpath, silent=True):
                cmdline = '{sambamba} index {output_bam_fpath}'.format(**locals())
                call(cnf, cmdline, silent=not cnf.verbose)
                bams_by_sample[sample.name] = output_bam_fpath
    return bams_by_sample

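# Illustrative usage sketch; the variant dict, sample list, and paths below are
# hypothetical, not defined in this module:
#
#   variant = {'pos': 1234567, 'ref': 'A', 'alt': 'T', 'transcripts': ['NM_000001']}
#   bams_by_sample = extract_variant_from_bams(
#       cnf, out_dirpath, transcripts, chr_length=248956422, samples=samples,
#       chrom='chr1', variant=variant, bams_created_before=set())
#   # -> {'<sample>': '<out_dirpath>/chr1-NM_000001-<exon_idx>-<sample>.bam', ...}
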
def __final_seq2c_scripts(cnf, read_stats_fpath, combined_gene_depths_fpath, output_fpath):
    cov2lr = get_script_cmdline(cnf, 'perl', join('Seq2C', 'cov2lr.pl'), is_critical=True)
    cov2lr_output = join(cnf.work_dir, splitext(basename(output_fpath))[0] + '.cov2lr.tsv')

    controls = ''
    lr2gene_opt = ''
    if cnf.controls:
        controls = '-c ' + cnf.controls  # ':'.join([adjust_path(fpath) for fpath in cnf.controls.split(':')])
        lr2gene_opt = '-c'

    cmdline = '{cov2lr} -a {controls} {read_stats_fpath} {combined_gene_depths_fpath}'.format(**locals())
    call(cnf, cmdline, cov2lr_output, exit_on_error=False)
    info()
    if not verify_file(cov2lr_output):
        return None

    seq2c_opts = cnf.seq2c_opts or ''
    lr2gene = get_script_cmdline(cnf, 'perl', join('Seq2C', 'lr2gene.pl'), is_critical=True)
    cmdline = '{lr2gene} {lr2gene_opt} {seq2c_opts} {cov2lr_output}'.format(**locals())
    res = call(cnf, cmdline, output_fpath, exit_on_error=False)
    info()
    if not verify_file(output_fpath):
        return None
    return res

def annotate_target(cnf, target_bed):
    output_fpath = intermediate_fname(cnf, target_bed, 'ann')
    if not cnf.genome.bed_annotation_features:
        return output_fpath
    if can_reuse(output_fpath, target_bed):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    features_bed = verify_bed(cnf.genome.bed_annotation_features, is_critical=True,
                              description='bed_annotation_features in system config')

    # annotate_bed_py = get_system_path(cnf, 'python', join('tools', 'bed_processing', 'annotate_bed.py'))
    # bedtools = get_system_path(cnf, 'bedtools')
    annotate_bed_py = which('annotate_bed.py')
    if not annotate_bed_py:
        critical('Error: annotate_bed.py not found in PATH, please install TargQC.')

    cmdline = '{annotate_bed_py} {target_bed} --work-dir {cnf.work_dir} -g {cnf.genome.name} ' \
              '-o {output_fpath} --canonical'.format(**locals())
    # cmdline = '{annotate_bed_py} {target_bed} --work-dir {cnf.work_dir} --reference {features_bed} ' \
    #           '--genome {cnf.genome.name} --sys-cnf {cnf.sys_cnf} --run-cnf {cnf.run_cnf} ' \
    #           '-o {output_fpath}'.format(**locals())
    call(cnf, cmdline, output_fpath, stdout_to_outputfile=False)
    output_fpath = remove_comments(cnf, output_fpath)
    return output_fpath

def submit_job(cnf, cmdline, job_name, wait_for_steps=None, threads=1, output_fpath=None,
               stdout_to_outputfile=True, run_on_chara=False, **kwargs):
    prefix = str(cnf.project_name) + '_'
    if job_name:
        prefix += job_name + '_'
    prefix += datetime.now().strftime("%Y_%m_%d_%H_%M_%S") + '_'

    f, done_marker_fpath = make_tmpfile(cnf, prefix=prefix, suffix='.done')
    f, error_marker_fpath = make_tmpfile(cnf, prefix=prefix, suffix='.error')
    if isfile(done_marker_fpath):
        os.remove(done_marker_fpath)
    if isfile(error_marker_fpath):
        os.remove(error_marker_fpath)
    job_id = basename(splitext(done_marker_fpath)[0])

    tx_output_fpath = None
    if output_fpath:
        if cnf.reuse_intermediate and verify_file(output_fpath, silent=True):
            info(output_fpath + ' exists, reusing')
            j = JobRunning(None, None, None, None, None, output_fpath=output_fpath, **kwargs)
            j.is_done = True
            return j
        if stdout_to_outputfile:
            tx_output_fpath = output_fpath + '.tx'
            if isfile(tx_output_fpath):
                os.remove(tx_output_fpath)
            cmdline += ' > ' + tx_output_fpath
        else:
            if isfile(output_fpath):
                os.remove(output_fpath)

    qsub = get_system_path(cnf, 'qsub', is_critical=True)
    bash = get_system_path(cnf, 'bash', is_critical=True)

    if cnf.log_dir:
        err_fpath = log_fpath = join(cnf.log_dir, job_id + '.log')
    else:
        fd, fpath = make_tmpfile(cnf, suffix=job_id + '.log', text=True)
        err_fpath = log_fpath = fpath

    queue = cnf.queue
    runner_script = adjust_system_path(cnf.qsub_runner)
    verify_file(runner_script, is_critical=True, description='qsub_runner')
    hold_jid_line = '-hold_jid ' + ','.join(wait_for_steps or ['_'])
    mem = threads * 15
    priority = 0
    if cnf.qsub_priority:
        priority = cnf.qsub_priority
    extra_qsub_opts = ''
    if run_on_chara and is_us():
        extra_qsub_opts += '-l h="chara|rask"'

    cmdline = cmdline.replace('"', '\\"').replace('\\\\"', '\\"')
    qsub_cmdline = (
        '{qsub} -pe smp {threads} {extra_qsub_opts} -S {bash} -q {queue} -p {priority} '
        '-j n -o {log_fpath} -e {err_fpath} {hold_jid_line} '
        '-N {job_id} {runner_script} {done_marker_fpath} {error_marker_fpath} "{cmdline}"'
    ).format(**locals())

    info('Submitting job ' + job_id)
    info(qsub_cmdline)

    job = JobRunning(job_id, log_fpath, qsub_cmdline, done_marker_fpath, error_marker_fpath,
                     output_fpath=output_fpath, tx_output_fpath=tx_output_fpath,
                     stdout_to_outputfile=stdout_to_outputfile, **kwargs)
    call(cnf, qsub_cmdline, silent=True)
    return job

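# Illustrative usage sketch (the command and file names are hypothetical): submit
# a shell command as a grid job and block until it finishes:
#
#   j = submit_job(cnf, 'sort -k1,1 -k2,2n in.bed > out.bed', job_name='sort_bed',
#                  threads=1, output_fpath='out.bed', stdout_to_outputfile=False)
#   wait_for_jobs(cnf, [j])
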
def _intersect_with_tricky_regions(cnf, selected_bed_fpath, sample):
    info()
    info('Detecting problematic regions for ' + sample)

    bed_filenames = [fn + '.bed.gz' for fn in tricky_regions_fnames_d.keys()]
    merged_bed_fpaths = [join(cnf.genome.tricky_regions, 'merged', bed_filename)
                         for bed_filename in bed_filenames]

    info('Intersecting BED ' + selected_bed_fpath + ' using BED files with tricky regions')

    intersection_fpath = join(cnf.work_dir,
                              splitext_plus(basename(selected_bed_fpath))[0] + '_tricky_vcf_bed.intersect')
    if not cnf.reuse_intermediate or not verify_file(intersection_fpath, silent=True, is_critical=False):
        bedtools = get_system_path(cnf, 'bedtools')
        cmdline = bedtools + ' intersect -header -a ' + selected_bed_fpath + \
                  ' -b ' + ' '.join(merged_bed_fpaths) + ' -wo -filenames'
        call(cnf, cmdline, output_fpath=intersection_fpath, exit_on_error=False)
    return intersection_fpath

def _get_depth_for_each_variant(cnf, var_by_site, clipped_gz_vcf_fpath, bed_fpath, bam_fpath):
    # http://www.1000genomes.org/faq/what-depth-coverage-your-phase1-variants
    # bedtools intersect -a oncomine.vcf -b Exons.az_key.bed -header > oncomine.az_key.vcf
    # /opt/az/local/tabix/tabix-0.2.6/bgzip oncomine.az_key.vcf
    # /opt/az/local/tabix/tabix-0.2.6/tabix -h -p vcf oncomine.az_key.vcf.gz
    # samtools view -b TRF004223.sorted.bam -L Exons.az_key.bed | bedtools genomecov -ibam stdin -bg > coverage.bg
    # bedtools intersect -a oncomine.az_key.vcf.gz -b coverage.bg -wa | cut -f1,2,4,5,8,11,12,13,14 > oncomine.az_key.depth_numbers.vcf

    sambamba = get_system_path(cnf, join(get_ext_tools_dirname(), 'sambamba'), is_critical=True)
    bedtools = get_system_path(cnf, 'bedtools')

    info()
    info('Depth of coverage for regions in BED ' + bed_fpath)
    cov_bg = join(cnf.work_dir, 'coverage.bg')
    cmdline = '{sambamba} view -f bam -t {cnf.threads} -L {bed_fpath} {bam_fpath} | ' \
              '{bedtools} genomecov -ibam stdin -bg'.format(**locals())
    call(cnf, cmdline, output_fpath=cov_bg, exit_on_error=False)

    info()
    info('Intersecting depth regions with VCF ' + clipped_gz_vcf_fpath)
    vcf_depth_numbers_fpath = join(cnf.work_dir, 'vcf_bg.intersect')
    if not cnf.reuse_intermediate or not verify_file(vcf_depth_numbers_fpath, silent=True, is_critical=False):
        cmdline = '{bedtools} intersect -a {clipped_gz_vcf_fpath} -b {cov_bg} -wao'.format(**locals())
        res = call(cnf, cmdline, output_fpath=vcf_depth_numbers_fpath, exit_on_error=False)
    # if res != oncomine_depth_numbers_fpath:
    #     info()
    #     info('Trying with uncompressed VCF')
    #     cmdline = 'gunzip {vcf_fpath} -c | {bedtools} intersect -a - -b {cov_bg} -wao | cut -f1,2,4,5,8,11,12,13,14,15'.format(**locals())
    #     call(cnf, cmdline, output_fpath=oncomine_depth_numbers_fpath)

    depths_per_var = defaultdict(list)
    with open(vcf_depth_numbers_fpath) as f:
        for l in f:
            # 1,2,4,5,8,11,12,13,14,15,16,17,18,19,20
            # c,p,r,a,f,ch,st,en,ge,ex,st,ft,bt,de,ov
            fs = l.replace('\n', '').split('\t')
            chrom, pos, _, ref, alt = fs[:5]
            depth, overlap = fs[-2:]
            var = var_by_site.get((chrom, pos, ref, alt))
            if var and depth != '.':
                depth, overlap = int(depth), int(overlap)
                for i in range(overlap):
                    depths_per_var[(chrom, pos, ref, alt)].append(depth)

    # Getting average depth of coverage of each variant (exactly for those parts that were in BED)
    depth_by_var = {var: (sum(depths) / len(depths)) if len(depths) != 0 else None
                    for var, depths in depths_per_var.iteritems()}
    return depth_by_var

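# The calls above follow the shell recipe in the leading comment block; an
# equivalent manual run (file names hypothetical) would be:
#
#   sambamba view -f bam -t 4 -L regions.bed sample.bam \
#       | bedtools genomecov -ibam stdin -bg > coverage.bg
#   bedtools intersect -a variants.vcf.gz -b coverage.bg -wao > vcf_bg.intersect
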
def add_project_to_exac(cnf):
    info('Adding project to ExAC database')
    exac_venv_pythonpath = join(exac_venv_dir, 'bin', 'python')
    if is_local():
        exac_venv_pythonpath = 'python'
    cmdline = exac_venv_pythonpath + ' ' + join(exac_code_dir, 'manage.py') + ' ' + 'add_project' + \
              ' ' + cnf.project_name + ' ' + cnf.genome.name
    call(cnf, cmdline)

def get_padded_bed_file(cnf, bed, genome, padding):
    info('Making bed file for padded regions...')
    bedtools = get_system_path(cnf, 'bedtools')
    cmdline = '{bedtools} slop -i {bed} -g {genome} -b {padding}'.format(**locals())
    output_fpath = intermediate_fname(cnf, bed, 'padded')
    call(cnf, cmdline, output_fpath)
    return output_fpath

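# Illustrative usage (file names hypothetical): extend each interval by 10 bp on
# both sides, clipped to chromosome ends by the bedtools "genome" file of
# chromosome lengths:
#
#   padded_bed = get_padded_bed_file(cnf, 'target.bed', 'hg19.chrom.sizes', 10)
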
def calculate_coverage_use_grid(cnf, samples, output_dirpath):
    assert len(samples) > 0

    sambamba = get_system_path(cnf, join(get_ext_tools_dirname(), 'sambamba'), is_critical=True)
    chr_len_fpath = get_chr_len_fpath(cnf)

    jobs_to_wait = []

    for sample in samples:
        sample_output_dirpath = join(output_dirpath, sample.name)
        safe_mkdir(sample_output_dirpath)

    for chrom in chromosomes:
        info('Processing chromosome ' + chrom)
        avg_cov_output_fpath = join(output_dirpath, chrom + '.txt.gz')
        sample_output_fpaths = [join(output_dirpath, sample.name, chrom + '.txt.gz') for sample in samples]
        sample_names = ','.join(sample.name for sample in samples)

        chrom_bams = []
        for sample in samples:
            if not verify_file(sample.bam):
                err('BAM for ' + sample.name + ' does not exist!')
                continue
            output_bam_fpath = join(cnf.work_dir, basename(sample.name) + '_' + str(chrom) + '.bam')
            cmdline = '{sambamba} slice {sample.bam} {chrom}'.format(**locals())
            call(cnf, cmdline, output_fpath=output_bam_fpath)
            if verify_file(output_bam_fpath):
                chrom_bams.append(output_bam_fpath)

        bam_fpaths = ','.join(chrom_bams)

        if cnf.reuse_intermediate and verify_file(avg_cov_output_fpath, silent=True) and \
                all(verify_file(output_fpath, silent=True) for output_fpath in sample_output_fpaths):
            info(avg_cov_output_fpath + ' exists, reusing')
        else:
            j = _submit_region_cov(cnf, cnf.work_dir, chrom, bam_fpaths, sample_names,
                                   output_dirpath, chr_len_fpath)
            if j and not j.is_done:
                jobs_to_wait.append(j)
            info()

        if len(jobs_to_wait) >= cnf.threads:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
            jobs_to_wait = []
        elif not jobs_to_wait:
            info('No jobs to submit.')
    if jobs_to_wait:
        wait_for_jobs(cnf, jobs_to_wait)

def intersect_bed(cnf, bed1, bed2):
    bed1_fname, _ = splitext_plus(basename(bed1))
    bed2_fname, _ = splitext_plus(basename(bed2))
    output_fpath = join(cnf['work_dir'], bed1_fname + '__' + bed2_fname + '.bed')
    bedtools = get_system_path(cnf, 'bedtools')
    cmdline = '{bedtools} intersect -u -a {bed1} -b {bed2}'.format(**locals())
    call(cnf, cmdline, output_fpath, verify_output_not_empty=False)
    return output_fpath

def add_project_files_to_jbrowse(cnf, bcbio_structure):
    genome = cnf.genome.name
    jbrowse_data_path, _, _ = set_folders(genome)

    jbrowse_dirpath = join(jbrowse_data_path, 'tracks')
    jbrowse_project_dirpath = join(jbrowse_dirpath, bcbio_structure.project_name)
    safe_mkdir(jbrowse_project_dirpath)
    jbrowse_tracks_fpath = join(jbrowse_data_path, 'tracks.conf')

    tabix = get_system_path(cnf, 'tabix')

    vcf_fpath_by_sample = None
    caller = bcbio_structure.variant_callers.get('vardict') or \
             bcbio_structure.variant_callers.get('vardict-java')
    if caller:
        vcf_fpath_by_sample = caller.get_filt_vcf_by_sample()

    for sample in bcbio_structure.samples:
        if sample.bam:
            index_bam(cnf, sample.bam, use_grid=True)

    for sample in bcbio_structure.samples:
        if all(isfile(join(jbrowse_project_dirpath, sample.name + ext))
               for ext in ['.bam', '.bam.bai', '.vcf.gz', '.vcf.gz.tbi', '.bigwig']) \
                and check_tracks_in_configs(sample.name, bcbio_structure.project_name,
                                            jbrowse_tracks_fpath, vcf_fpath_by_sample):
            info(sample.name + ' was exported to jBrowse previously.')
            continue

        vcf_link = None
        if vcf_fpath_by_sample:
            vcf_fpath = vcf_fpath_by_sample[sample.name] if sample.name in vcf_fpath_by_sample else None
            if vcf_fpath and verify_file(vcf_fpath):
                vcf_link = create_jbrowse_symlink(genome, bcbio_structure.project_name,
                                                  sample.name, vcf_fpath)
                if not verify_file(vcf_fpath + '.tbi'):
                    cmdline = '{tabix} {vcf_fpath}'.format(**locals())
                    call(cnf, cmdline, exit_on_error=False)
                create_jbrowse_symlink(genome, bcbio_structure.project_name,
                                       sample.name, vcf_fpath + '.tbi')

        if sample.bam:
            bam_link = create_jbrowse_symlink(genome, bcbio_structure.project_name,
                                              sample.name, sample.bam)
            create_jbrowse_symlink(genome, bcbio_structure.project_name,
                                   sample.name, sample.bam + '.bai')
            bigwig_link = create_jbrowse_symlink(genome, bcbio_structure.project_name,
                                                 sample.name, splitext(sample.bam)[0] + '.bigwig')
            print_sample_tracks_info(sample.name, bcbio_structure.project_name,
                                     trunc_symlink(bam_link), trunc_symlink(bigwig_link),
                                     trunc_symlink(vcf_link), jbrowse_tracks_fpath)

def vcf_one_per_line(cnf, vcf_fpath):
    info('Converting VCF to one-effect-per-line...')
    oneperline_vcf_fpath = intermediate_fname(cnf, vcf_fpath, 'opl')
    vcfoneperline_cmline = get_script_cmdline(cnf, 'perl', join('ext_tools', 'vcfOnePerLine.pl'))
    call(cnf, vcfoneperline_cmline, oneperline_vcf_fpath, stdin_fpath=vcf_fpath, exit_on_error=False)
    info()
    if not verify_file(oneperline_vcf_fpath):
        critical('Error: vcf_one_per_line didn\'t generate output file.')
    return oneperline_vcf_fpath

def convert_to_bigwig(bedgraph_fpath, cnf, chr_len_fpath, bw_fpath):
    try:
        with file_transaction(cnf.work_dir, bw_fpath) as tx_fpath:
            cmdl = get_system_path(cnf, join(get_ext_tools_dirname(), 'bedGraphToBigWig'), is_critical=True)
            cmdl += ' ' + bedgraph_fpath + ' ' + chr_len_fpath + ' ' + tx_fpath
            call(cnf, cmdl, exit_on_error=True)
    finally:
        os.remove(bedgraph_fpath)
    return bw_fpath

def total_merge_bed(cnf, bed_fpath):
    output_fpath = intermediate_fname(cnf, bed_fpath, 'total_merged')
    bedops = get_system_path(cnf, 'bedops')
    if bedops:
        cmdline = '{bedops} --merge {bed_fpath}'.format(**locals())
    else:
        bedtools = get_system_path(cnf, 'bedtools')
        cmdline = '{bedtools} merge -i {bed_fpath}'.format(**locals())
    call(cnf, cmdline, output_fpath)
    return output_fpath

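# Illustrative usage (file name hypothetical); either branch yields a BED with
# overlapping intervals collapsed, preferring bedops when it is available:
#
#   merged_bed = total_merge_bed(cnf, 'regions.bed')
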
def group_and_merge_regions_by_gene(cnf, bed_fpath, keep_genes=False):
    output_fpath = intermediate_fname(cnf, bed_fpath, 'grp_mrg')
    group_merge_bed_py = get_system_path(cnf, 'python',
                                         join('tools', 'bed_processing', 'group_and_merge_by_gene.py'))
    cmdline = '{group_merge_bed_py} {bed_fpath}'.format(**locals())
    if not keep_genes:
        cmdline += ' | grep -vw Gene'
    call(cnf, cmdline, output_fpath)
    return output_fpath

def main():
    cnf = read_opts_and_cnfs(
        extra_opts=[
            (['--bam'], dict(dest='bam', help='path to the BAM file')),
            (['--bed', '--capture', '--amplicons'], dict(dest='bed', help='capture panel/amplicons')),
            (['--pcr'], dict(dest='pcr', action='store_true',
                             help='deduplication was not performed, thus do not try to dedup')),
        ],
        required_keys=['bam'],
        file_keys=['bam', 'bed'],
        key_for_sample_name='bam',
        proc_name=BCBioStructure.qualimap_name)

    index_bam(cnf, cnf.bam)

    info('Using alignment ' + cnf.bam)
    bed = ''
    if cnf.bed:
        bed = ' -gff ' + cnf.bed + ' '
        info('Using amplicons/capture panel ' + cnf.bed)

    qualimap = get_system_path(cnf, 'qualimap', is_critical=True)
    if not qualimap:
        critical('Cannot find qualimap')

    info()

    mem_cmdl = ''
    mem_m = get_qualimap_max_mem(cnf.bam)
    mem = str(int(mem_m)) + 'M'
    mem_cmdl = ' --java-mem-size=' + mem

    cmdline = ('{qualimap} bamqc --skip-duplicated -nt ' + str(cnf.threads) + mem_cmdl +
               ' -nr 5000 -bam {cnf.bam} -outdir {cnf.output_dir} {bed} -c -gd HUMAN').format(**locals())

    report_fpath = join(cnf.output_dir, 'qualimapReport.html')
    call(cnf, cmdline, output_fpath=report_fpath, stdout_to_outputfile=False,
         env_vars=dict(DISPLAY=None))

    info('Qualimap report: ' + str(report_fpath))

def bam_to_bed(cnf, bam_fpath, to_gzip=True):
    # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    info('Converting the BAM to BED to save some memory.')
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + ('.bed.gz' if to_gzip else '.bed')
    bedtools = get_system_path(cnf, 'bedtools')
    gzip = get_system_path(cnf, 'gzip')
    cmdline = '{bedtools} bamtobed -i {bam_fpath}'.format(**locals())
    cmdline += ' | {gzip}'.format(**locals()) if to_gzip else ''
    call(cnf, cmdline, output_fpath=bam_bed_fpath, verify_output_not_empty=False)
    return bam_bed_fpath

def call_sambamba(cnf, cmdl, bam_fpath, output_fpath=None, sambamba=None, use_grid=False,
                  command_name='', sample_name=None, silent=False, stdout_to_outputfile=True):
    sambamba = sambamba or get_system_path(cnf, join(get_ext_tools_dirname(), 'sambamba'), is_critical=True)
    sample_name = sample_name or basename(bam_fpath).split('.')[0]
    if use_grid:
        grid_sambamba = get_script_cmdline(cnf, 'python', join('tools', 'bed_processing', 'sambamba.py'))
        cmdl = cmdl.replace(' "', ' \'\"__QUOTE__')
        cmdl = cmdl.replace('" ', '__QUOTE__\"\' ')
        grid_cmdl = grid_sambamba + ' ' + bam_fpath + ' ' + sambamba + ' ' + cmdl
        job_name = command_name + '_' + sample_name
        j = submit_job(cnf, grid_cmdl, job_name=job_name, output_fpath=output_fpath,
                       stdout_to_outputfile=stdout_to_outputfile)
        info()
        return j
    else:
        index_bam(cnf, bam_fpath, sambamba=sambamba)
        cmdl = sambamba + ' ' + cmdl
        stderr_dump = []
        res = call(cnf, cmdl, output_fpath=output_fpath, exit_on_error=False,
                   stderr_dump=stderr_dump, stdout_to_outputfile=stdout_to_outputfile,
                   silent=silent, print_stderr=not silent)
        if not res:
            for l in stderr_dump:
                if 'sambamba-view: BAM index file (.bai) must be provided' in l:
                    if isfile(bam_fpath + '.bai'):
                        info('Removing .bai and re-indexing...')
                        os.remove(bam_fpath + '.bai')
                    index_bam(cnf, bam_fpath, sambamba)
                    res = call(cnf, cmdl, output_fpath=output_fpath)
        return res

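# Illustrative usage sketches (sample and file names hypothetical; "view -c" is a
# standard sambamba subcommand that prints a read count):
#
#   # run locally, writing the count to a file:
#   cnt = call_sambamba(cnf, 'view -c sample.bam', 'sample.bam',
#                       output_fpath='sample.readcount.txt', command_name='count')
#   # or submit to the grid; returns a job to pass to wait_for_jobs():
#   j = call_sambamba(cnf, 'view -c sample.bam', 'sample.bam',
#                     output_fpath='sample.readcount.txt', command_name='count',
#                     use_grid=True)
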
def evaluate_capture(cnf, project_dirpaths):
    cmdline = get_script_cmdline(cnf, 'python', join('tools', 'evaluate_capture_target.py'), is_critical=True)
    project_dirpaths = ' '.join(project_dirpaths)
    cmdline += ' --genome {cnf.genome.name} --project-name {cnf.project_name} {project_dirpaths} '.format(**locals())
    cmdline += ' --exac-only-filtering --tricky-regions '
    if cnf.bed:
        cmdline += ' --bed ' + cnf.bed
    depth_thresholds = [10, 25, 50, 100]
    for min_depth in depth_thresholds:
        cmdline += ' --min-depth {min_depth}'.format(**locals())
    call(cnf, cmdline)

def merge_vcfs(cnf, vcf_fpath_by_sname, combined_vcf_fpath):
    if cnf.reuse_intermediate and isfile(combined_vcf_fpath + '.gz') \
            and verify_vcf(combined_vcf_fpath + '.gz'):
        info(combined_vcf_fpath + '.gz exists, reusing')
        return combined_vcf_fpath + '.gz'

    bcftools = get_system_path(cnf, 'bcftools')
    if not bcftools:
        info('bcftools is not found, skipping merging VCFs')
        return None

    cmdl = '{bcftools} merge --force-samples '.format(**locals())
    for sample, vcf_fpath in vcf_fpath_by_sname.iteritems():
        if vcf_fpath:
            cmdl += ' ' + vcf_fpath + ' '
    cmdl += ' -o ' + combined_vcf_fpath

    res = call(cnf, cmdl, output_fpath=combined_vcf_fpath, stdout_to_outputfile=False, exit_on_error=False)
    if res:
        info('Joined VCFs, saved into ' + combined_vcf_fpath)
        if isfile(combined_vcf_fpath + '.tx.idx'):
            try:
                os.remove(combined_vcf_fpath + '.tx.idx')
            except OSError:
                info()
        return bgzip_and_tabix(combined_vcf_fpath)
    else:
        warn('Could not join VCFs')
        return None

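# Illustrative usage (sample names and paths hypothetical); note that bcftools
# merge expects bgzipped, indexed per-sample VCFs:
#
#   merged = merge_vcfs(cnf,
#                       {'sample1': 'sample1.vcf.gz', 'sample2': 'sample2.vcf.gz'},
#                       'combined.vcf')
#   # -> 'combined.vcf.gz' (bgzipped and tabix-indexed), or None on failure
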
def make_circos_plot(cnf, bcbio_structure):
    circos_fpath = join(bcbio_structure.date_dirpath, 'circos.html')
    mutations_fpaths = get_mutations_fpaths(bcbio_structure)
    if not mutations_fpaths:
        err('File with Vardict results does not exist. Circos plot cannot be created.')
        return
    if not bcbio_structure.seq2c_fpath:
        err('File with Seq2C results does not exist. Circos plot cannot be created.')
        return
    cmdl = 'circos --genome ' + cnf.genome.name + ' -o ' + bcbio_structure.date_dirpath + \
           ' --mutations ' + ','.join(mutations_fpaths) + ' --seq2c ' + bcbio_structure.seq2c_fpath + \
           ' ' + bcbio_structure.bcbio_project_dirpath
    call(cnf, cmdl, exit_on_error=False, silent=False)
    if verify_file(circos_fpath):
        return circos_fpath

def markdup_bam(cnf, in_bam_fpath, bammarkduplicates=None):
    """Perform non-stream based deduplication of BAM input files using biobambam.
    """
    if not bammarkduplicates:
        bammarkduplicates = get_system_path(cnf, 'bammarkduplicates')
    if not bammarkduplicates:
        warn('No biobambam bammarkduplicates, can\'t mark duplicates.')
        return None
    out_bam_fpath = add_suffix(in_bam_fpath, 'markdup')
    tmp_fpath = join(cnf.work_dir, splitext_plus(basename(in_bam_fpath))[0] + '_markdup')
    safe_mkdir(dirname(tmp_fpath))
    cmdline = ('{bammarkduplicates} tmpfile={tmp_fpath} '
               'I={in_bam_fpath} O={out_bam_fpath}').format(**locals())
    res = call(cnf, cmdline, output_fpath=out_bam_fpath, stdout_to_outputfile=False, exit_on_error=False)
    if res:
        return out_bam_fpath
    else:
        return None

def picard_ins_size_hist(cnf, sample, bam_fpath, output_dir):
    picard = get_system_path(cnf, 'java', 'picard')
    if picard:
        safe_mkdir(dirname(sample.picard_ins_size_hist_txt_fpath))
        safe_mkdir(dirname(sample.picard_ins_size_hist_pdf_fpath))
        info('Picard ins size hist for "' + basename(bam_fpath) + '"')
        cmdline = '{picard} CollectInsertSizeMetrics' \
                  ' I={bam_fpath}' \
                  ' O={sample.picard_ins_size_hist_txt_fpath}' \
                  ' H={sample.picard_ins_size_hist_pdf_fpath}' \
                  ' VALIDATION_STRINGENCY=LENIENT'
        cmdline = cmdline.format(**locals())
        call(cnf, cmdline, output_fpath=sample.picard_ins_size_hist_txt_fpath,
             stdout_to_outputfile=False, exit_on_error=False)

def index_bam(cnf, bam_fpath, sambamba=None, samtools=None, use_grid=False):
    if use_grid:
        return index_bam_grid(cnf, bam_fpath, sambamba)

    sambamba = sambamba or get_system_path(cnf, join(get_ext_tools_dirname(), 'sambamba'), is_critical=True)
    indexed_bam = bam_fpath + '.bai'
    if not isfile(indexed_bam) or getmtime(indexed_bam) < getmtime(bam_fpath):
        info('Indexing BAM, writing ' + indexed_bam + '...')
        cmdline = '{sambamba} index {bam_fpath}'.format(**locals())
        res = call(cnf, cmdline, exit_on_error=False)
        if not isfile(indexed_bam) or getmtime(indexed_bam) < getmtime(bam_fpath):
            samtools = samtools or get_system_path(cnf, 'samtools')
            cmdline = '{samtools} index {bam_fpath}'.format(**locals())
            call(cnf, cmdline)
    else:
        debug('Up-to-date ".bai" index exists.')

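# Illustrative usage (path hypothetical): the index is rebuilt only when the .bai
# is missing or older than the BAM, with a samtools fallback if sambamba fails:
#
#   index_bam(cnf, 'sample.bam')                     # index locally
#   j = index_bam(cnf, 'sample.bam', use_grid=True)  # or submit as a grid job
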
def generate_combined_bam(cnf, bam_fpaths, temp_combined_bam_fpath, combined_bam_fpath):
    info('Combining %s bams into %s' % (len(bam_fpaths), combined_bam_fpath))
    if cnf.reuse_intermediate and verify_file(combined_bam_fpath, silent=True):
        return combined_bam_fpath

    # sorted_bam_paths = sorted(bam_fpaths, key=lambda bam_path: int(bam_path_to_dict(bam_path)['pos']))
    read_group_ids = map(bam_path_to_read_group_id, bam_fpaths)
    read_groups = [{'ID': read_group_id, 'SM': 0} for read_group_id in read_group_ids]

    out_bam = None
    for bam_fpath in bam_fpaths:
        try:
            ibam = pysam.AlignmentFile(bam_fpath, 'rb')
            if out_bam is None:
                header = {
                    'HD': {'VN': '1.4'},
                    'SQ': ibam.header['SQ'],
                    'RG': read_groups,
                }
                out_bam = pysam.AlignmentFile(temp_combined_bam_fpath, 'wb', header=header)
            # iterate over the reads, tagging each with its source read group
            rg_tag = (('RG', bam_path_to_read_group_id(bam_fpath)), )
            for r in ibam:
                r.tags = rg_tag
                out_bam.write(r)
            ibam.close()
        except (IOError, ValueError) as e:
            err('ERROR on file %s: %s' % (bam_fpath, e))
    if out_bam is not None:
        out_bam.close()

    sambamba = get_system_path(cnf, join(get_ext_tools_dirname(), 'sambamba'), is_critical=True)
    cmdline = '{sambamba} sort -t {cnf.threads} {temp_combined_bam_fpath} -o {combined_bam_fpath}'.format(**locals())
    call(cnf, cmdline)
    cmdline = '{sambamba} index {combined_bam_fpath}'.format(**locals())
    call(cnf, cmdline)
    info(combined_bam_fpath + ' saved!')

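# A minimal standalone sketch of the pysam pattern used above (copying reads into
# a new BAM while re-tagging their read group); file and group names hypothetical:
#
#   import pysam
#   ibam = pysam.AlignmentFile('in.bam', 'rb')
#   header = {'HD': {'VN': '1.4'}, 'SQ': ibam.header['SQ'],
#             'RG': [{'ID': 'rg1', 'SM': 0}]}
#   obam = pysam.AlignmentFile('out.bam', 'wb', header=header)
#   for r in ibam:
#       r.tags = (('RG', 'rg1'),)
#       obam.write(r)
#   obam.close()
#   ibam.close()
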
def igvtools_index(cnf, vcf_fpath):
    igvtools = get_system_path(cnf, 'igvtools')
    if not igvtools:
        err('Warning: no igvtools found, cannot index VCF.')
        return None
    if igvtools.endswith('.jar'):
        igvtools = get_java_tool_cmdline(cnf, 'igvtools')
        if igvtools is None:
            err('Warning: no jar igvtools found, cannot index VCF.')
            return None

    cmdline = '{igvtools} index {vcf_fpath}'.format(**locals())
    call(cnf, cmdline, exit_on_error=False)
    if exists('igv.log'):
        try:
            os.remove('igv.log')
        except OSError:
            pass
    return vcf_fpath + '.idx'

def run_targqc(cnf, bam_by_sample, bed_fpath, output_dirpath):
    info('Running TargQC for downsampled BAMs')
    targqc = get_script_cmdline(cnf, 'python', 'targqc.py', is_critical=True)
    targqc_work_dir = join(cnf.work_dir, 'TargQC')
    targqc_log_dir = join(cnf.log_dir, 'TargQC')
    safe_mkdir(targqc_work_dir)
    safe_mkdir(targqc_log_dir)
    bed_cmdl = ''
    if bed_fpath:
        bed_cmdl = '--bed ' + bed_fpath
    bam_cmdl = ' '.join(bam_fpath + ',' + sname for sname, bam_fpath in bam_by_sample.items())
    cmdl = '{targqc} --sys-cnf {cnf.sys_cnf} {bam_cmdl} {bed_cmdl} ' \
           '--work-dir {targqc_work_dir} --log-dir {targqc_log_dir} --project-name {cnf.project_name} ' \
           '-o {output_dirpath} --genome {cnf.genome.name}'.format(**locals())
    if cnf.reuse_intermediate:
        cmdl += ' --reuse'
    call(cnf, cmdl)

def run_fastq(cnf, sample_name, l_r_fpath, r_r_fpath, output_dirpath, downsample_to=1e7):
    fastqc = get_system_path(cnf, 'fastqc', is_critical=True)
    java = get_system_path(cnf, 'java', is_critical=True)

    if downsample_to:
        info('Downsampling to ' + str(downsample_to))
        l_r_fpath, r_r_fpath = downsample(cnf, sample_name, l_r_fpath, r_r_fpath,
                                          downsample_to, output_dir=cnf.work_dir)

    # Joining fastq files to run on a combination
    fastqc_fpath = join(cnf.work_dir, sample_name + '.fq')
    info('Combining fastqs, writing to ' + fastqc_fpath)
    with open(fastqc_fpath, 'w') as out:
        out.write(open_gzipsafe(l_r_fpath).read())
        out.write(open_gzipsafe(r_r_fpath).read())

    # Running FastQC
    info('Running FastQC')
    tmp_dirpath = join(cnf.work_dir, 'FastQC_' + sample_name + '_tmp')
    safe_mkdir(tmp_dirpath)
    cmdline = '{fastqc} --dir {tmp_dirpath} --extract -o {output_dirpath} ' \
              '-f fastq -j {java} {fastqc_fpath}'.format(**locals())
    call(cnf, cmdline)

    # Cleaning and getting report
    sample_fastqc_dirpath = join(output_dirpath, sample_name + '.fq_fastqc')
    if isfile(sample_fastqc_dirpath + '.zip'):
        os.remove(sample_fastqc_dirpath + '.zip')
    fastqc_html_fpath = join(sample_fastqc_dirpath, 'fastqc_report.html')
    verify_file(fastqc_html_fpath, is_critical=True)
    return sample_fastqc_dirpath

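# Illustrative usage (paths hypothetical): downsample a read pair to 10M reads,
# run FastQC on the concatenation, and return the extracted report directory:
#
#   fastqc_dir = run_fastq(cnf, 'sample1', 'sample1_R1.fq.gz', 'sample1_R2.fq.gz',
#                          output_dirpath='fastqc_out', downsample_to=1e7)
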
def vcf_merge(cnf, vcf_fpaths, combined_vcf_fpath):
    vcf_merge_cmdline = get_system_path(cnf, join('ext_tools', 'vcftools', 'scripts', 'vcf-merge'))
    if vcf_merge_cmdline is None:
        critical('No vcf_merge in path')

    cmdline = vcf_merge_cmdline + ' ' + ' '.join(vcf_fpaths)
    perl_module_dirpath = abspath(join(dirname(__file__), pardir, pardir, 'ext_modules', 'perl_modules'))
    os.environ['PERL5LIB'] = perl_module_dirpath
    res = call(cnf, cmdline, combined_vcf_fpath, exit_on_error=False)
    if not res:
        return None
    return combined_vcf_fpath

def remove_dups_picard(cnf, bam_fpath):
    picard = get_system_path(cnf, 'java', 'picard')
    if not picard:
        critical('No picard in the system')

    info('Running picard dedup for "' + basename(bam_fpath) + '"')

    dup_metrics_txt = join(cnf.work_dir, 'picard_dup_metrics.txt')
    output_fpath = intermediate_fname(cnf, bam_fpath, 'pcd_dedup')

    cmdline = '{picard} MarkDuplicates' \
              ' I={bam_fpath}' \
              ' O={output_fpath}' \
              ' METRICS_FILE={dup_metrics_txt}' \
              ' REMOVE_DUPLICATES=True' \
              ' VALIDATION_STRINGENCY=LENIENT'
    res = call(cnf, cmdline.format(**locals()), output_fpath=output_fpath,
               stdout_to_outputfile=False, exit_on_error=False)

    if res != output_fpath:  # error occurred, try to correct BAM and restart
        warn('Picard deduplication failed for "' + basename(bam_fpath) +
             '". Fixing BAM and restarting Picard...')
        bam_fpath = _fix_bam_for_picard(cnf, bam_fpath)
        res = call(cnf, cmdline.format(**locals()), output_fpath=output_fpath,
                   stdout_to_outputfile=False, exit_on_error=False)

    if res == output_fpath:
        dup_rate = _parse_picard_dup_report(dup_metrics_txt)
        assert dup_rate is None or dup_rate <= 1.0, str(dup_rate)
        info('Duplication rate (picard): ' + str(dup_rate))
        return output_fpath
    else:
        return None