def bgzip_and_tabix(fpath, reuse=False, tabix_parameters='', **kwargs):
    """Compress `fpath` with bgzip and index the compressed file with tabix.

    :param fpath: path of the file to compress; the output is `fpath + '.gz'`
        and the index is `fpath + '.gz.tbi'`.
    :param reuse: when true, skip the work if the compressed file and its index
        already exist and are not older than their respective sources.
    :param tabix_parameters: extra command-line options inserted into the tabix call.
    :param kwargs: accepted for call compatibility; not used here.
    :return: path to the compressed file, or `fpath` unchanged when bgzip
        and/or tabix cannot be found.
    """
    gzipped_fpath = fpath + '.gz'
    tbi_fpath = gzipped_fpath + '.tbi'

    if reuse \
            and file_exists(gzipped_fpath) \
            and (not file_exists(fpath) or getctime(gzipped_fpath) >= getctime(fpath)) \
            and file_exists(tbi_fpath) \
            and getctime(tbi_fpath) >= getctime(gzipped_fpath):
        info('Actual compressed file and index exist, reusing')
        return gzipped_fpath

    info('Compressing and tabixing file, writing ' + gzipped_fpath + '(.tbi)')
    bgzip = which('bgzip')
    tabix = which('tabix')
    if not bgzip:
        err('Cannot index file because bgzip is not found')
    if not tabix:
        err('Cannot index file because tabix is not found')
    # Both tools are required: with only one of them missing, the commands
    # below would be built from a missing executable path.
    if not bgzip or not tabix:
        return fpath

    # Drop stale outputs so neither tool trips over an existing file.
    if isfile(gzipped_fpath):
        os.remove(gzipped_fpath)
    if isfile(tbi_fpath):
        os.remove(tbi_fpath)

    info('BGzipping ' + fpath)
    # NOTE(review): bgzip is invoked without `-c`; per the htslib manual this
    # replaces the input file with the `.gz` - confirm callers expect that.
    cmdline = '{bgzip} {fpath}'.format(bgzip=bgzip, fpath=fpath)
    call_process.run(cmdline)

    info('Tabixing ' + gzipped_fpath)
    cmdline = '{tabix} {tabix_parameters} {gzipped_fpath}'.format(
        tabix=tabix, tabix_parameters=tabix_parameters, gzipped_fpath=gzipped_fpath)
    call_process.run(cmdline)
    return gzipped_fpath
def sort_bed_gsort(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, genome=None):
    """Sort a BED file with `gsort` against a genome `.fai` index.

    :param input_bed_fpath: BED file to sort; checked with `verify_bed(..., is_critical=True)`.
    :param output_bed_fpath: destination path; when omitted, an intermediate
        name with the 'sorted' suffix is derived from `work_dir` and the input.
    :param work_dir: directory used for the intermediate name and the file transaction.
    :param fai_fpath: path to the `.fai` file; takes precedence over `genome`.
    :param genome: genome build name, used to look the `.fai` up via `ref.get_fai`.
    :return: path to the sorted BED file.
    """
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    if output_bed_fpath:
        output_bed_fpath = adjust_path(output_bed_fpath)
    else:
        output_bed_fpath = intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    # Resolve the genome index: an explicit path wins over the build name.
    if fai_fpath:
        fai_fpath = verify_file(fai_fpath)
    elif genome:
        fai_fpath = verify_file(ref.get_fai(genome))
    else:
        critical('Either of fai_fpath or genome build name must be specified')

    gsort_cmdl = 'gsort {input_bed_fpath} {fai_fpath}'.format(
        input_bed_fpath=input_bed_fpath, fai_fpath=fai_fpath)
    with file_transaction(work_dir, output_bed_fpath) as tx:
        run(gsort_cmdl, output_fpath=tx)
    return output_bed_fpath
def build_snps_panel(bcbio_projs=None, bed_files=None, output_dir=None, genome=None):
    """Build `<output_dir>/snps.bed`: dbSNP records restricted to the overlap of the given BED files.

    :param bcbio_projs: optional iterable of project objects; each contributes
        its `coverage_bed`, or `call` when `coverage_bed` is falsy.
    :param bed_files: optional iterable of extra BED file paths.
    :param output_dir: directory that receives `snps.bed` and a `work` subdirectory.
    :param genome: genome build passed to `get_dbsnp` and `_make_snp_file`.
    :return: path to the selected SNPs BED file.
    """
    selected_snps_file = join(output_dir, 'snps.bed')
    if can_reuse(selected_snps_file, bed_files):
        return selected_snps_file

    work_dir = safe_mkdir(join(output_dir, 'work'))

    log.info('Intersecting BED files for projects.')
    all_bed_files = set()
    for proj in bcbio_projs or []:
        if not proj.coverage_bed:
            all_bed_files.add(proj.call)
            continue
        log.info(proj.project_name + ': selecting ' + proj.coverage_bed)
        all_bed_files.add(proj.coverage_bed)
    all_bed_files.update(bed_files or [])

    overlapped_bed = join(work_dir, 'merged_bed_files.bed')
    log.info(f'BED files: {all_bed_files}, mergin, writing {overlapped_bed}')
    overlap_bed_files(all_bed_files, overlapped_bed)

    # Selecting SNPs from dbSNP
    dbsnp_file = get_dbsnp(genome)
    dbsnp_snps_file = join(work_dir, 'snps_in_merged_bed_files.bed')
    intersect_is_fresh = can_reuse(dbsnp_snps_file, [dbsnp_file, overlapped_bed])
    if not intersect_is_fresh:
        cmdl = f'bedtools intersect -header -a {dbsnp_file} -b {overlapped_bed}'
        call_process.run(cmdl, dbsnp_snps_file)

    subset_bed_file = add_suffix(dbsnp_snps_file, 'subset')
    _make_snp_file(dbsnp_snps_file, genome, subset_bed_file)

    shutil.copyfile(subset_bed_file, selected_snps_file)
    return selected_snps_file
def cut(fpath, col_num, output_fpath=None):
    """Keep only the first `col_num` tab-separated columns of `fpath`.

    :param fpath: input file path.
    :param col_num: number of leading columns to keep (columns 1..col_num).
    :param output_fpath: destination path; defaults to `fpath` with the 'cut' suffix.
    :return: path to the output file.
    """
    output_fpath = output_fpath or add_suffix(fpath, 'cut')
    if can_reuse(output_fpath, fpath):
        return output_fpath
    columns = ','.join(map(str, range(1, col_num + 1)))
    # No explicit `> output_fpath` here: the output is handed to the runner via
    # `output_fpath=`, as the other helpers in this file do, so a shell redirect
    # would be redundant and would write to the final path directly.
    cmdline = 'cut -f' + columns + ' ' + fpath
    call_process.run(cmdline, output_fpath=output_fpath)
    return output_fpath
def lift_over(fpath, from_genome, to_genome):
    """Convert `fpath` between genome builds with the `liftOver` tool.

    The chain file is looked up in the `over.chain` directory next to this
    module, named `<from_genome>To<To_genome>.over.chain.gz` (the target build
    name goes through `str.title()`).

    :param fpath: input file path.
    :param from_genome: source genome build name.
    :param to_genome: target genome build name; also used as the output suffix.
    :return: path to the converted file; unmapped records go to `<output>.unMapped`.
    """
    chain_fname = f'{from_genome}To{to_genome.title()}.over.chain.gz'
    chain_file = join(dirname(__file__), 'over.chain', chain_fname)
    chain_is_available = verify_file(chain_file)
    if not chain_is_available:
        log.critical(f'Error: conversion from {from_genome} to {to_genome} is not supported!')

    out_fpath = add_suffix(fpath, to_genome)
    liftover_cmdl = f'liftOver {fpath} {chain_file} {out_fpath} {out_fpath}.unMapped'
    call_process.run(liftover_cmdl)
    return out_fpath
def get_padded_bed_file(work_dir, bed, padding, fai_fpath):
    """Extend every region of `bed` by `padding` on both sides using `bedtools slop`.

    :param work_dir: directory for the intermediate output file.
    :param bed: input BED file path.
    :param padding: value passed to `bedtools slop -b`.
    :param fai_fpath: file passed to `bedtools slop -g`.
    :return: path to the padded BED file (intermediate name with the 'padded' suffix).
    """
    info('Making bed file for padded regions...')
    slop_cmdl = '{bedtools} slop -i {bed} -g {genome_fpath} -b {padding}'.format(
        bedtools=which('bedtools'), bed=bed, genome_fpath=fai_fpath, padding=padding)
    padded_fpath = intermediate_fname(work_dir, bed, 'padded')
    call_process.run(slop_cmdl, output_fpath=padded_fpath)
    return padded_fpath
def cut(fpath, col_num, output_fpath=None):
    """Write the first `col_num` tab-separated columns of `fpath` to a new file.

    :param fpath: input file path.
    :param col_num: how many leading columns to keep (columns 1..col_num).
    :param output_fpath: destination path; defaults to `fpath` with the 'cut' suffix.
    :return: path to the output file.
    """
    output_fpath = output_fpath or add_suffix(fpath, 'cut')
    if can_reuse(output_fpath, fpath):
        return output_fpath
    field_list = ','.join(str(i) for i in range(1, col_num + 1))
    # The runner receives the destination through `output_fpath=`, like the
    # sibling helpers in this file; the former explicit `> output_fpath` shell
    # redirect duplicated that and wrote to the final path directly.
    cmdline = 'cut -f' + field_list + ' ' + fpath
    call_process.run(cmdline, output_fpath=output_fpath)
    return output_fpath
def get_padded_bed_file(work_dir, bed, padding, fai_fpath):
    """Pad the regions of a BED file with `bedtools slop`.

    :param work_dir: directory for the intermediate output file.
    :param bed: input BED file path.
    :param padding: value passed to `bedtools slop -b`.
    :param fai_fpath: file passed to `bedtools slop -g`.
    :return: path to the padded BED file (intermediate name with the 'padded' suffix).
    """
    info('Making bed file for padded regions...')
    fields = {
        'bedtools': which('bedtools'),
        'bed': bed,
        'genome_fpath': fai_fpath,
        'padding': padding,
    }
    cmdline = '{bedtools} slop -i {bed} -g {genome_fpath} -b {padding}'.format(**fields)
    result_fpath = intermediate_fname(work_dir, bed, 'padded')
    call_process.run(cmdline, output_fpath=result_fpath)
    return result_fpath
def overlap_bed_files(bed_files, output_bed_file):
    """Write the intersection of several BED files to `output_bed_file`.

    A single input is simply copied. Otherwise `bedops --intersect` is run over
    the inputs, each passed through `sort-bed` via `<(...)` process substitution.

    :param bed_files: sized collection (list or set) of BED file paths.
    :param output_bed_file: destination path.
    :return: `output_bed_file`.
    """
    if can_reuse(output_bed_file, bed_files):
        return output_bed_file

    if len(bed_files) == 1:
        # Read the only element without `.pop()` so the caller's collection
        # is left untouched.
        only_bed_file = next(iter(bed_files))
        shutil.copy(only_bed_file, output_bed_file)
        return output_bed_file

    cmdl = 'bedops --intersect' + ''.join(' <(sort-bed ' + bf + ')' for bf in bed_files)
    call_process.run(cmdl, output_bed_file)
    return output_bed_file
def intersect_bed(work_dir, bed1, bed2):
    """Run `bedtools intersect -u -a bed1 -b bed2` and store the result in `work_dir`.

    The output is named `<bed1 stem>__<bed2 stem>.bed`, where the stems are the
    base names with the extension removed by `splitext_plus`.

    :param work_dir: directory for the output file.
    :param bed1: BED file passed as `-a`.
    :param bed2: BED file passed as `-b`.
    :return: path to the output BED file.
    """
    stem1 = splitext_plus(basename(bed1))[0]
    stem2 = splitext_plus(basename(bed2))[0]
    output_fpath = join(work_dir, stem1 + '__' + stem2 + '.bed')
    if can_reuse(output_fpath, [bed1, bed2]):
        return output_fpath

    cmdline = '{bedtools} intersect -u -a {bed1} -b {bed2}'.format(
        bedtools=which('bedtools'), bed1=bed1, bed2=bed2)
    call_process.run(cmdline, output_fpath=output_fpath,
                     checks=[call_process.file_exists_check])
    return output_fpath
def call_sambamba(cmdl, bam_fpath, output_fpath=None, command_name='', no_index=False):
    """Run a sambamba sub-command, indexing the BAM first unless told not to.

    :param cmdl: sambamba arguments, appended after the executable path.
    :param bam_fpath: BAM file to index before the call.
    :param output_fpath: forwarded to `run` as `output_fpath`.
    :param command_name: accepted for call compatibility; not used here.
    :param no_index: when true, skip the `index_bam` step.
    :return: `output_fpath`, as given.
    """
    if not no_index:
        index_bam(bam_fpath)
    full_cmdl = get_executable() + ' ' + cmdl
    run(full_cmdl, output_fpath=output_fpath)
    return output_fpath
def bam_to_bed(bam_fpath, to_gzip=True):
    """Convert a BAM file to BED with `bedtools bamtobed`, optionally gzip-compressed.

    :param bam_fpath: input BAM path; the output sits next to it with the
        extension replaced by `.bed.gz` (or `.bed`).
    :param to_gzip: when true, pipe the BED output through gzip.
    :return: path to the BED file.
    """
    debug('Converting the BAM to BED to save some memory.')
    # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    bed_ext = '.bed.gz' if to_gzip else '.bed'
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + bed_ext
    if can_reuse(bam_bed_fpath, bam_fpath):
        return bam_bed_fpath

    bedtools = which('bedtools')
    gzip = which('gzip')
    cmdline = '{bedtools} bamtobed -i {bam_fpath}'.format(bedtools=bedtools, bam_fpath=bam_fpath)
    if to_gzip:
        cmdline += ' | {gzip}'.format(gzip=gzip)
    call_process.run(cmdline, output_fpath=bam_bed_fpath)
    return bam_bed_fpath
def bam_to_bed(bam_fpath, to_gzip=True):
    """Dump a BAM file as BED via `bedtools bamtobed`, gzip-compressing by default.

    :param bam_fpath: input BAM path; its extension is swapped for `.bed.gz`
        (or `.bed` when `to_gzip` is false) to form the output path.
    :param to_gzip: whether to pipe the BED stream through gzip.
    :return: path to the BED file.
    """
    debug('Converting the BAM to BED to save some memory.')
    # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    bam_stem = splitext_plus(bam_fpath)[0]
    if to_gzip:
        bam_bed_fpath = bam_stem + '.bed.gz'
    else:
        bam_bed_fpath = bam_stem + '.bed'
    if can_reuse(bam_bed_fpath, bam_fpath):
        return bam_bed_fpath

    tools = {'bedtools': which('bedtools'), 'gzip': which('gzip')}
    cmdline = '{bedtools} bamtobed -i {bam_fpath}'.format(bam_fpath=bam_fpath, **tools)
    gzip_stage = ' | {gzip}'.format(**tools) if to_gzip else ''
    call_process.run(cmdline + gzip_stage, output_fpath=bam_bed_fpath)
    return bam_bed_fpath
def annotate_target(work_dir, target_bed, genome_build):
    """Annotate a target BED file with the `bed_annotation` tool and clean the result.

    The executable is searched for as `annotate_bed.py` first and as
    `bed_annotation` second; an error is reported only when neither is found.

    :param work_dir: directory for intermediate files.
    :param target_bed: BED file to annotate.
    :param genome_build: genome build name passed to the tool as `-g`.
    :return: path to the annotated BED file after `clean_bed`.
    """
    output_fpath = intermediate_fname(work_dir, target_bed, 'ann')
    if can_reuse(output_fpath, target_bed):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    # Fall back to the alternative executable name before giving up; previously
    # the error fired even when the fallback lookup succeeded.
    bed_annotation = which('annotate_bed.py') or which('bed_annotation')
    if not bed_annotation:
        critical('Error: bed_annotation not found in PATH, please install `conda install -c vladsaveliev bed_annotation`.')

    cmdline = '{bed_annotation} {target_bed} -g {genome_build} -o {output_fpath}'.format(
        bed_annotation=bed_annotation, target_bed=target_bed,
        genome_build=genome_build, output_fpath=output_fpath)
    # The tool writes the file itself through `-o`, so stdout is not redirected.
    run(cmdline, output_fpath, stdout_to_outputfile=False)
    output_fpath = clean_bed(output_fpath, work_dir)
    return output_fpath
def _slice_vcf_fn(work_dir, label, vcf_file, overlapped_bed):
    """Restrict `vcf_file` to the regions of `overlapped_bed` using `bcftools view`.

    :param work_dir: directory for the sliced VCF.
    :param label: name used for the output file (`<label>.sliced.vcf`); returned as-is.
    :param vcf_file: input VCF path.
    :param overlapped_bed: BED file passed as `--targets-file`.
    :return: tuple `(label, sliced_vcf_file)`.
    """
    sliced_vcf_file = join(work_dir, label + '.sliced.vcf')
    needs_slicing = not can_reuse(sliced_vcf_file, [vcf_file])
    if needs_slicing:
        slice_cmdl = f'bcftools view {vcf_file} --targets-file {overlapped_bed} -o {sliced_vcf_file}'
        run(slice_cmdl)

    # Disabled annotation step, kept for reference:
    # ann_vcf_file = join(work_dir, label + '.sliced.ann.vcf')
    # if not can_reuse(ann_vcf_file, [sliced_vcf_file]):
    #     vcf_header = join(work_dir, label + '.vcf_header')
    #     with open(vcf_header, 'w') as f:
    #         f.write('##INFO=<ID=CHROM,Number=1,Type=String,Description="Region chromosome">\n')
    #         f.write('##INFO=<ID=FROM,Number=1,Type=String,Description="Region start">\n')
    #         f.write('##INFO=<ID=TO,Number=1,Type=String,Description="Region end">\n')
    #     run(f'bcftools annotate -c CHROM,FROM,TO -a {overlapped_bed} {sliced_vcf_file} '
    #         f'-h {vcf_header} -o {ann_vcf_file}')

    return label, sliced_vcf_file
def annotate_target(work_dir, target_bed, genome_build):
    """Run the `bed_annotation` tool on a target BED file and clean its output.

    Two executable names are tried in order, `annotate_bed.py` and then
    `bed_annotation`; the missing-tool error is raised only if both are absent.

    :param work_dir: directory for intermediate files.
    :param target_bed: BED file to annotate.
    :param genome_build: genome build name passed to the tool as `-g`.
    :return: path to the annotated BED file after `clean_bed`.
    """
    output_fpath = intermediate_fname(work_dir, target_bed, 'ann')
    if can_reuse(output_fpath, target_bed):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    # Try each known executable name; the error used to be reported even when
    # the second lookup found the tool.
    bed_annotation = None
    for exe_name in ('annotate_bed.py', 'bed_annotation'):
        bed_annotation = which(exe_name)
        if bed_annotation:
            break
    if not bed_annotation:
        critical(
            'Error: bed_annotation not found in PATH, please install `conda install -c vladsaveliev bed_annotation`.'
        )

    cmdline = '{bed_annotation} {target_bed} -g {genome_build} -o {output_fpath}'.format(
        bed_annotation=bed_annotation, target_bed=target_bed,
        genome_build=genome_build, output_fpath=output_fpath)
    # Output goes through the tool's own `-o`, hence no stdout redirection.
    run(cmdline, output_fpath, stdout_to_outputfile=False)
    output_fpath = clean_bed(output_fpath, work_dir)
    return output_fpath
def index_bam(bam_fpath, sambamba=None, samtools=None):
    """Create `<bam_fpath>.bai` with `sambamba index` unless a reusable index exists.

    :param bam_fpath: BAM file to index.
    :param sambamba: path to the sambamba executable; resolved with
        `get_executable()` when not given.
    :param samtools: accepted for call compatibility; not used here.
    :return: None.
    """
    sambamba = sambamba or get_executable()
    indexed_bam = bam_fpath + '.bai'
    if can_reuse(indexed_bam, cmp_f=bam_fpath, silent=True):
        return
    index_cmdl = '{sambamba} index {bam_fpath}'.format(sambamba=sambamba, bam_fpath=bam_fpath)
    run(index_cmdl, output_fpath=indexed_bam, stdout_to_outputfile=False, stdout_tx=False)
def ungzip_if_needed(cnf, fpath, silent=False):
    """Return the uncompressed counterpart of `fpath`, running `gunzip -c` if it is missing.

    A trailing `.gz` is stripped from `fpath` first. Decompression happens only
    when the plain file does not exist and the `.gz` one does.

    :param cnf: accepted for call compatibility; not used here.
    :param fpath: path to the file, with or without the `.gz` extension.
    :param silent: when false, `info()` is called after decompression.
    :return: the uncompressed path, or None when the gunzip run returns a falsy result.
    """
    if fpath.endswith('.gz'):
        fpath = fpath[:-3]

    gz_fpath = fpath + '.gz'
    if file_exists(fpath) or not file_exists(gz_fpath):
        return fpath

    res = run('gunzip -c {gz_fpath}'.format(gz_fpath=gz_fpath), output_fpath=fpath)
    if not silent:
        info()
    if not res:
        return None
    return fpath
def _calculate(bam_file, work_dir, genome_fasta_file, min_depth):
    """Compute callable regions for a BAM file with `goleft depth`.

    Produces `<prefix>.callable.bed` through goleft (skipped when reusable) and
    then filters it down to the records named 'CALLABLE'. The prefix is
    `work_dir` joined with `bam_samplename(bam_file)`.

    Note carried over from the original documentation: goleft runs samtools
    depth in parallel, and samtools depth drops duplicates and secondary reads
    from the counts:
        if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;

    :param bam_file: input BAM path.
    :param work_dir: directory for the output files.
    :param genome_fasta_file: reference passed to goleft as `--reference`.
    :param min_depth: value passed to goleft as `--mincov`.
    :return: path to `<prefix>.callable.CALLABLE.bed`.
    """
    output_prefix = os.path.join(work_dir, bam_samplename(bam_file))

    callability_annotation_file = output_prefix + '.callable.bed'
    if not can_reuse(callability_annotation_file, bam_file):
        info(f'Calculating coverage at {bam_file}')
        goleft_cmdl = (
            f'goleft depth --q 1 --mincov {min_depth} --reference {genome_fasta_file} --ordered'
            f' --prefix {output_prefix} {bam_file}')
        run(goleft_cmdl)

    callable_file = output_prefix + '.callable.CALLABLE.bed'
    if not can_reuse(callable_file, callability_annotation_file):
        def _is_callable(region):
            return region.name == 'CALLABLE'

        with file_transaction(None, callable_file) as tx:
            all_regions = pybedtools.BedTool(callability_annotation_file)
            all_regions.filter(_is_callable).saveas(tx)

    return callable_file
def build_tree(run):
    """Write a FASTA of per-sample genotypes and build a phylogeny tree from it with prank.

    For every sample of every project in `run`, one FASTA record is written:
    the header is the sample's long name and the sequence is the concatenation
    of `get_gt()` over all of the run's locations. prank's `.best.dnd` output
    is moved to `run.tree_file_path()` and its `.best.fas` output is removed.

    :param run: run object providing `projects`, `locations`, `work_dir_path()`,
        `fasta_file_path()` and `tree_file_path()`.
    :return: path to the FASTA file.
    """
    info('Writing fasta to ' + run.fasta_file_path())
    samples = []
    for project in run.projects:
        samples.extend(project.samples)

    with open(run.fasta_file_path(), 'w') as fhw:
        for s in samples:
            snps_by_rsid = s.snps_from_run(run)
            fhw.write('>' + s.long_name() + '\n')
            genotypes = [snps_by_rsid[loc.rsid].get_gt() for loc in run.locations.all()]
            fhw.write(''.join(genotypes) + '\n')
    info('All fasta saved to ' + run.fasta_file_path())

    info()
    info('Building phylogeny tree using prank...')
    fasta_stem = splitext(basename(run.fasta_file_path()))[0]
    prank_out = join(run.work_dir_path(), fasta_stem)
    prank_cmdl = 'prank -d=' + run.fasta_file_path() + ' -o=' + prank_out + ' -showtree'
    call_process.run(prank_cmdl)

    tree_from_prank = prank_out + '.best.dnd'
    if not verify_file(tree_from_prank):
        critical('Prank failed to run')
    os.rename(tree_from_prank, run.tree_file_path())
    os.remove(prank_out + '.best.fas')

    return run.fasta_file_path()
def _vardict_pileup_sample(sample, work_dir, output_dir, genome_fasta_file, snp_file):
    """Genotype one sample at the given SNP positions with VarDict and write an annotated VCF.

    Steps: run VarDict in pileup mode over `snp_file`; convert its output to
    VCF through `teststrandbias.R` and `var2vcf_valid.pl`; patch records that
    would not parse; set each record's ID and `ANNOTATION` INFO field from the
    VarDict output; add the matching header line with `bcftools annotate`.

    :param sample: sample object providing `name` and `bam`.
    :param work_dir: directory for intermediate files.
    :param output_dir: directory for the final `<sample.name>.vcf`.
    :param genome_fasta_file: reference FASTA path.
    :param snp_file: file with the SNP regions, passed to VarDict.
    :return: path to the final VCF file.
    """
    vardict_snp_vars = join(work_dir, sample.name + '_vars.txt')
    vcf_file = join(output_dir, sample.name + '.vcf')
    if can_reuse(vardict_snp_vars, [sample.bam, snp_file]) and can_reuse(vcf_file, vardict_snp_vars):
        return vcf_file

    vardict_exec = which('vardict')
    if not vardict_exec:
        critical('Error: vardict is not in PATH. Please install it with `conda install -c bioconda vardict`')

    # Run VarDict
    index_bam(sample.bam)
    cmdl = '{vardict_exec} -G {genome_fasta_file} -N {sample.name} -b {sample.bam} -p -D {snp_file}'.format(
        vardict_exec=vardict_exec, genome_fasta_file=genome_fasta_file,
        sample=sample, snp_file=snp_file)
    call_process.run(cmdl, output_fpath=vardict_snp_vars)

    # Complex variants might have a shifted start positions with respect to rsid so we are
    # associating starts with rsid for further snp identification.
    # A plain dict is used: values are annotation strings, and a missing key
    # must be detected rather than silently defaulted.
    ann_by_var = {}
    with open(vardict_snp_vars) as f:
        for line in f:
            fs = line.split('\t')
            ann, chrom, start = fs[1], fs[2], fs[3]
            ann_by_var[(chrom, start)] = ann

    info()
    info('Converting to VCF')

    work_vcf_file = join(work_dir, sample.name + '_vars.vcf')
    cmdl = ('cut -f-34 ' + vardict_snp_vars +
            ' | awk -F"\\t" -v OFS="\\t" \'{for (i=1;i<=NF;i++) { if ($i=="") $i="0" } print $0 }\''
            ' | ' + 'teststrandbias.R' +
            ' | ' + 'var2vcf_valid.pl' + ' -A -f 0.2')
    call_process.run(cmdl, output_fpath=work_vcf_file)

    # Fix non-call records with empty REF and ALT, and "NA" values assigned to INFO's SN and HICOV
    fixed_vcf_file = add_suffix(work_vcf_file, 'fixed')
    info('Fixing VCF for parsing, writing to ' + fixed_vcf_file)
    with open(work_vcf_file) as inp, open(fixed_vcf_file, 'w') as out_f:
        for line in inp:
            if line.startswith('#'):
                out_f.write(line)
                continue
            fs = line.split('\t')
            chrom, pos, alt = fs[0], int(fs[1]), fs[4]
            if alt in ['.', '']:
                # Reading the reference allele from fasta
                fs[4] = fs[3] = _get_fasta_ref(genome_fasta_file, chrom, pos)
            line = '\t'.join(fs)
            line = line.replace('=NA;', '=.;')
            line = line.replace('=;', '=.;')
            line = line.replace('TYPE=0', 'TYPE=REF')
            out_f.write(line)
    assert verify_file(fixed_vcf_file)

    info('Annotating VCF with gene names and rsIDs')
    ann_vcf_file = add_suffix(fixed_vcf_file, 'ann')
    with open(fixed_vcf_file) as f, open(ann_vcf_file, 'w') as out:
        vcf_reader = vcf.Reader(f)
        vcf_writer = vcf.Writer(out, vcf_reader)
        for rec in vcf_reader:
            ann = ann_by_var.get((rec.CHROM, str(rec.POS)))
            if ann is None:
                # Previously a miss produced an empty list and failed on
                # `.split` with an unrelated-looking AttributeError.
                critical('Error: no VarDict annotation found for ' + str(rec.CHROM) + ':' +
                         str(rec.POS) + ' in ' + vardict_snp_vars)
            rec.ID = ann.split('|')[0]
            rec.INFO['ANNOTATION'] = ann
            vcf_writer.write_record(rec)
    assert verify_file(ann_vcf_file), ann_vcf_file

    # Add the header line describing the ANNOTATION field; the header text is
    # supplied to bcftools through `<(echo ...)` process substitution.
    ann_hdr_vcf_file = add_suffix(ann_vcf_file, 'hdr')
    cmdl = 'bcftools annotate -h <(echo ' \
           '\'##INFO=<ID=ANNOTATION,Number=1,Type=String,Description="rsid|gene_name|ref|alts">\') ' + \
           bgzip_and_tabix(ann_vcf_file)
    call_process.run(cmdl, output_fpath=ann_hdr_vcf_file)

    debug('Renaming ' + ann_hdr_vcf_file + ' -> ' + vcf_file)
    os.rename(ann_hdr_vcf_file, vcf_file)
    return vcf_file