def count_bam(
    bamfile: str,
    bed_file: str,
    output: str,
    tmpdir: str,
    output_dir: str,
    chrom_sizes: str,
    use_fast_count: bool = True,
    verbose: bool = True,
):
    """Count BAM reads overlapping each interval of ``bed_file``.

    Stages the inputs into ``tmpdir``, runs a bedtools pipeline, and stages
    the resulting 4-column BED (chr, start, end, count) back to
    ``output_dir``.

    Args:
        bamfile: path to the BAM to count reads from.
        bed_file: path to the BED file of regions to count over.
        output: output filename; only its basename is used — the result is
            written to ``output_dir``.
        tmpdir: local scratch directory the job stages files into.
        output_dir: directory the final counts file is staged back to.
        chrom_sizes: chromosome-sizes file used as a sort order (faidx); the
            slow path also reads its ``<chrom_sizes>.bed`` companion.
        use_fast_count: choose the faster ``bedtools coverage`` pipeline that
            works directly on the BAM; the slow path converts BAM to BED
            first via ``bedtools bamtobed``.
        verbose: unused here — NOTE(review): confirm whether callers rely on
            passing it.

    Returns:
        The ``script(...)`` task object; its outputs dict exposes ``file``
        (staged output File) and ``path`` (final output path).
    """
    # Local (staged) counterparts of every input/output path.
    local_bamfile = join(tmpdir, basename(bamfile))
    local_bed_file = join(tmpdir, basename(bed_file))
    local_chrom_sizes = join(tmpdir, basename(chrom_sizes))
    local_chrom_sizes_bed = ".".join([local_chrom_sizes, "bed"])
    chrom_sizes_bed = ".".join([chrom_sizes, "bed"])
    local_output = join(tmpdir, basename(output))
    if use_fast_count:
        # Sort order for bedtools must match the BAM header's contig order,
        # so build a temporary faidx restricted to contigs present in both
        # the chrom-sizes file and the BAM header.
        temp_output = local_output + ".temp_sort_order"
        return script(
            f"""
#!/bin/bash
awk 'FNR==NR {{x2[$1] = $0; next}} $1 in x2 {{print x2[$1]}}' {local_chrom_sizes} <(samtools view -H {local_bamfile} | grep SQ | cut -f 2 | cut -c 4- ) > {temp_output};
bedtools sort -faidx {temp_output} -i {local_bed_file} | bedtools coverage -g {temp_output} -counts -a stdin -b {local_bamfile} | awk '{{print $1"\t"$2"\t"$3"\t"$NF}}' | bedtools sort -faidx {local_chrom_sizes} -i stdin > {local_output};
rm {temp_output}
""",
            inputs=[
                File(bamfile).stage(File(local_bamfile)),
                File(chrom_sizes).stage(File(local_chrom_sizes)),
                File(bed_file).stage(File(local_bed_file)),
            ],
            outputs={
                "file": File(join(output_dir, basename(output))).stage(File(local_output)),
                "path": join(output_dir, basename(output)),
            },
        )
    else:
        # Slow path: convert BAM to BED, clip to the chrom-sizes BED, then
        # count with -sorted coverage (requires consistent sort order).
        return script(
            f"""
#!/bin/bash
bedtools bamtobed -i {local_bamfile} | cut -f 1-3 | bedtools intersect -wa -a stdin -b {local_chrom_sizes_bed} | bedtools sort -i stdin -faidx {local_chrom_sizes} | bedtools coverage -g {local_chrom_sizes} -counts -sorted -a {local_bed_file} -b stdin | awk '{{print $1"\t"$2"\t"$3"\t"$NF}}' > {local_output}
""",
            inputs=[
                File(bamfile).stage(File(local_bamfile)),
                File(chrom_sizes).stage(File(local_chrom_sizes)),
                File(chrom_sizes_bed).stage(File(local_chrom_sizes_bed)),
                File(bed_file).stage(File(local_bed_file)),
            ],
            outputs={
                "file": File(join(output_dir, basename(output))).stage(File(local_output)),
                "path": join(output_dir, basename(output)),
            },
        )
def count_tagalign_total(tagalign, tmpdir):
    """Count the total number of reads on canonical chromosomes in a
    gzipped tagAlign file.

    Args:
        tagalign: path to the gzipped tagAlign file.
        tmpdir: local scratch directory the file is staged into.

    Returns:
        The ``script(...)`` task whose stdout is the read count.
    """
    local_tagalign = join(tmpdir, basename(tagalign))
    # BUG FIX: the template previously read `zcat local_tagalign` — the
    # variable was never interpolated, so the shell looked for a file
    # literally named "local_tagalign".
    return script(
        f"""
zcat {local_tagalign} | grep -E 'chr[1-9]|chr1[0-9]|chr2[0-2]|chrX|chrY' | wc -l
""",
        # CONSISTENCY: wrap the stage target in File(...) like every other
        # function in this module.
        inputs=[File(tagalign).stage(File(local_tagalign))],
    )
def count_bam_mapped(bam_file, tmpdir):
    """Count mapped reads in a BAM using its index, without iterating reads.

    Requires that the BAM can be indexed (``samtools index`` is run first).
    An ``s3``-prefixed path is downloaded into ``tmpdir`` before counting;
    any other path is used in place.

    Args:
        bam_file: local or s3 path to the BAM file.
        tmpdir: scratch directory used for s3 downloads.

    Returns:
        The ``script(...)`` task whose stdout is the summed mapped-read count.
    """
    if not bam_file.startswith("s3"):
        local_bam_file = bam_file
    else:
        local_bam_file = join(tmpdir, basename(bam_file))
        download_file(bam_file, tmpdir, overwrite_ok=True)
    # idxstats column 3 is the mapped-read count per contig; drop the '*'
    # (unmapped) line and sum the rest with bc.
    commands = f"""
samtools index {local_bam_file}
samtools idxstats {local_bam_file} | grep -v '*' |cut -f3 | paste -sd+ | bc
"""
    return script(commands)
def download_raw(
    juicebox: str,
    hic_file: str,
    chromosome: str,
    output_dir: str,
    resolution: int,
    tmpdir: str,
):
    """Dump the raw (NONE-normalized) intra-chromosomal observed matrix from a
    .hic file with juicebox, gzip it, and stage ``tmpdir`` to ``output_dir``.

    Args:
        juicebox: command used to invoke juicebox/juicer tools.
        hic_file: path or URL of the .hic file.
        chromosome: chromosome name (without the ``chr`` prefix).
        output_dir: directory the tmpdir contents are staged back to.
        resolution: bin size in base pairs.
        tmpdir: local scratch directory the dump is written into.

    Returns:
        The ``script(...)`` task object.
    """
    raw_observed = f"{tmpdir}/chr{chromosome}.RAWobserved"
    commands = (
        "\n#!/bin/bash\n"
        f"{juicebox} dump observed NONE {hic_file} {chromosome} {chromosome} BP {resolution} {raw_observed}\n"
        f"gzip -f {raw_observed}\n"
    )
    return script(commands, outputs=[Dir(output_dir).stage(Dir(tmpdir))])
def download_observed_matrix(
    juicebox: str,
    hic_file: str,
    chromosome: str,
    output_dir: str,
    resolution: int,
    tmpdir: str,
):
    """Dump the KR-normalized observed matrix and the KR norm vector for one
    chromosome from a .hic file, gzip both, and stage ``tmpdir`` to
    ``output_dir``.

    Args:
        juicebox: command used to invoke juicebox/juicer tools.
        hic_file: path or URL of the .hic file.
        chromosome: chromosome name (without the ``chr`` prefix).
        output_dir: directory the tmpdir contents are staged back to.
        resolution: bin size in base pairs.
        tmpdir: local scratch directory the dumps are written into.

    Returns:
        The ``script(...)`` task object.
    """
    kr_observed = f"{tmpdir}/chr{chromosome}.KRobserved"
    kr_norm = f"{tmpdir}/chr{chromosome}.KRnorm"
    commands = (
        "\n#!/bin/bash\n"
        f"{juicebox} dump observed KR {hic_file} {chromosome} {chromosome} BP {resolution} {kr_observed}\n"
        f"gzip -f {kr_observed}\n"
        f"{juicebox} dump norm KR {hic_file} {chromosome} BP {resolution} {kr_norm}\n"
        f"gzip -f {kr_norm}\n"
    )
    return script(commands, outputs=[Dir(output_dir).stage(Dir(tmpdir))])
def make_tss_region_file(genes, output_dir, tmpdir, sizes, tss_slop=500):
    """Define regions of +/- ``tss_slop`` bp around the TSS of each gene.

    Builds a BED6 DataFrame of TSS windows clipped to chromosome bounds,
    writes it to ``tmpdir``, and returns a script task that bedtools-sorts it
    and stages it to ``output_dir``.

    Args:
        genes: DataFrame with at least columns chr, start, end, name, score,
            strand, tss.
        output_dir: directory the sorted TSS BED is staged back to; must
            already contain a copy of ``basename(sizes)`` for the faidx sort.
        tmpdir: local scratch directory.
        sizes: chrom-sizes path; ``sizes + ".bed"`` must exist.
        tss_slop: bp added on each side of the TSS (default 500 → 1 kb).

    Returns:
        The ``script(...)`` task; its outputs expose ``file``, ``path`` and
        the in-memory ``df``.
    """
    sizes_pr = df_to_pyranges(
        pd.read_csv(sizes + ".bed", sep="\t", header=None).rename(
            columns={0: "chr", 1: "start", 2: "end"}
        )
    )
    # .copy() so assignments below never mutate (or warn about) `genes`.
    tss1kb = genes.loc[:, ["chr", "start", "end", "name", "score", "strand"]].copy()
    tss1kb["start"] = genes["tss"]
    tss1kb["end"] = genes["tss"]
    # Expand the zero-width TSS points by tss_slop on each side, then clip
    # to chromosome bounds.
    tss1kb = df_to_pyranges(tss1kb).slack(tss_slop)
    tss1kb = pr.gf.genome_bounds(tss1kb, sizes_pr).df[
        ["Chromosome", "Start", "End", "name", "score", "strand"]
    ]
    tss1kb.columns = ["chr", "start", "end", "name", "score", "strand"]
    # BUG FIX: sort_values returns a new DataFrame — the result was being
    # discarded, so the file was written unsorted.
    tss1kb = tss1kb.sort_values(["chr", "start", "end"])
    tss1kb_file = os.path.join(tmpdir, "GeneList.TSS1kb.bed")
    tss1kb.to_csv(tss1kb_file, header=False, index=False, sep="\t")
    local_chrom_sizes = join(tmpdir, basename(sizes))
    tss1kb_out_file = join(output_dir, "GeneList.TSS1kb.bed")
    return script(
        f"""
bedtools sort -faidx {local_chrom_sizes} -i {tss1kb_file} > {tss1kb_file}.sorted;
mv {tss1kb_file}.sorted {tss1kb_file}
""",
        inputs=[
            File(join(output_dir, basename(sizes))).stage(File(local_chrom_sizes))
        ],
        outputs={
            "file": File(tss1kb_out_file).stage(tss1kb_file),
            "path": tss1kb_out_file,
            "df": tss1kb,
        },
    )
def count_tagalign(tagalign: str, bed_file: str, output: str, chrom_sizes: str, tmpdir: str):
    """Count tagAlign reads overlapping each interval of ``bed_file`` and
    write a 4-column BED (chr, start, end, count) to ``output``.

    Args:
        tagalign: path to a tabix-indexed tagAlign file.
        bed_file: BED file of regions to count over (also used as the tabix
            query regions).
        output: path the counts are written to.
        chrom_sizes: unused here — NOTE(review): confirm intended.
        tmpdir: unused here — NOTE(review): confirm intended.

    Returns:
        The ``script(...)`` task object.
    """
    # BUG FIX: the template was a plain string, so {tagalign}, {bed_file}
    # and {output} were never interpolated and the doubled awk braces were
    # emitted literally. Made it an f-string.
    return script(
        f"""
#!/bin/bash
tabix -B {tagalign} {bed_file} | cut -f1-3 |bedtools coverage -counts -b stdin -a {bed_file} | awk '{{print $1"\t"$2"\t" $3"\t"$NF}}'>{output}"""
    )
def make_candidate_regions_from_peaks(
    count_file: str,
    macs_peaks: str,
    chrom_sizes: str,
    output_dir: str,
    tmpdir: str,
    n_enhancers: int = 175000,
    regions_whitelist: str = None,
    regions_blacklist: str = None,
    peak_extend: int = 250,
    minPeakWidth: int = 500,
):
    """Generate candidate enhancer regions from MACS narrowPeak calls,
    using whole peaks (not summits).

    Pipeline: merge the count file taking the max count, keep the top
    ``n_enhancers`` regions, intersect back with the peaks, extend each peak
    by ``peak_extend`` bp enforcing a minimum width of ``minPeakWidth``,
    subtract the blacklist, union the whitelist, then sort and merge.

    Args:
        count_file: BED with per-region read counts in column 4.
        macs_peaks: MACS narrowPeak file.
        chrom_sizes: chrom-sizes file; its ``.bed`` companion is required
            when a whitelist is given.
        output_dir: directory the candidate-regions BED is staged back to.
        tmpdir: local scratch directory.
        n_enhancers: number of top-count regions to keep.
        regions_whitelist: optional BED always included in the output.
        regions_blacklist: optional BED always excluded from the output.
        peak_extend: bp added on each side of every peak.
        minPeakWidth: minimum width each extended peak is padded to.

    Returns:
        The ``script(...)`` task; outputs expose
        ``candidate_enhancer_regions_file`` and
        ``candidate_enhancer_regions_path``.
    """
    makedirs(output_dir)
    makedirs(tmpdir)
    outfile = join(output_dir, basename(macs_peaks) + ".candidateRegions.bed")

    # Local (staged) counterparts of the always-required files.
    local_macs_peaks = join(tmpdir, basename(macs_peaks))
    local_chrom_sizes = join(tmpdir, basename(chrom_sizes))
    # BUG FIX: local_chrom_sizes_bed was referenced below but never defined.
    chrom_sizes_bed = ".".join([chrom_sizes, "bed"])
    local_chrom_sizes_bed = ".".join([local_chrom_sizes, "bed"])
    local_outfile = join(tmpdir, basename(outfile))
    local_count_file = join(tmpdir, basename(count_file))

    inputs = [
        # BUG FIX: was File(counts_file) — an undefined name.
        File(count_file).stage(File(local_count_file)),
        File(chrom_sizes).stage(File(local_chrom_sizes)),
        File(macs_peaks).stage(File(local_macs_peaks)),
    ]

    ## Generate enhancer regions from MACS narrowPeak - do not use summits
    if regions_whitelist:
        # BUG FIX: computed only when a whitelist is given —
        # basename(None) raised a TypeError for the default arguments.
        local_regions_whitelist = join(tmpdir, basename(regions_whitelist))
        # BUG FIX: "instersect" typo in the bedtools subcommand.
        whitelist_command = (
            "(bedtools intersect -a " + local_regions_whitelist + " -b "
            + local_chrom_sizes_bed + " -wa | cut -f1-3 && cat ) | "
        )
        inputs.append(File(regions_whitelist).stage(File(local_regions_whitelist)))
        # The whitelist command reads the chrom-sizes BED companion file.
        inputs.append(File(chrom_sizes_bed).stage(File(local_chrom_sizes_bed)))
    else:
        whitelist_command = ""

    if regions_blacklist:
        local_regions_blacklist = join(tmpdir, basename(regions_blacklist))
        blacklist_command = (
            "bedtools intersect -v -wa -a stdin -b " + local_regions_blacklist + " | "
        )
        inputs.append(File(regions_blacklist).stage(File(local_regions_blacklist)))
    else:
        blacklist_command = ""

    # 2. Take top N regions, extend peaks (min size 500), merge, remove
    # blacklist, add whitelist, sort and merge.
    # use -sorted in intersect command? Not worth it, both files are small.
    # BUG FIX: blacklist_command and whitelist_command were not interpolated
    # (the script contained the literal text "blacklist_command" and the
    # non-existent "{local_whitelist_command}").
    return script(
        f"""
#!/bin/bash
bedtools sort -i {local_count_file} -faidx {local_chrom_sizes} | bedtools merge -i stdin -c 4 -o max | sort -nr -k 4 | head -n {n_enhancers} | \
bedtools intersect -b stdin -a {local_macs_peaks} -wa | \
bedtools slop -i stdin -b {peak_extend} -g {local_chrom_sizes} | \
awk '{{ l=$3-$2; if (l < {minPeakWidth}) {{ $2 = $2 - int(({minPeakWidth}-l)/2); $3 = $3 + int(({minPeakWidth}-l)/2) }} print $1"\t"$2"\t"$3}}' | \
bedtools sort -i stdin -faidx {local_chrom_sizes} | \
bedtools merge -i stdin | \
{blacklist_command} \
cut -f 1-3 | {whitelist_command} \
bedtools sort -i stdin -faidx {local_chrom_sizes} | bedtools merge -i stdin > {local_outfile}
""",
        inputs=inputs,
        outputs={
            "candidate_enhancer_regions_file": File(outfile).stage(local_outfile),
            "candidate_enhancer_regions_path": outfile,
        },
    )
def make_candidate_regions_from_summits(
    count_file: str,
    macs_peaks: str,
    chrom_sizes: str,
    output_dir: str,
    tmpdir: str,
    regions_whitelist: str = None,
    regions_blacklist: str = None,
    n_enhancers: int = 175000,
    peak_extend: int = 250,
):
    """Generate candidate enhancer regions from MACS peak summits.

    Pipeline: merge the count file taking the max count, keep the top
    ``n_enhancers`` regions, intersect back with the peaks, take each peak's
    summit (col 2 + col 10 of narrowPeak), extend by ``peak_extend`` bp,
    subtract the blacklist, union the whitelist, then sort and merge.

    Args:
        count_file: BED with per-region read counts in column 4.
        macs_peaks: MACS narrowPeak file (column 10 = summit offset).
        chrom_sizes: chrom-sizes file; its ``.bed`` companion must exist.
        output_dir: directory the candidate-regions BED is staged back to.
        tmpdir: local scratch directory.
        regions_whitelist: optional BED always included in the output.
        regions_blacklist: optional BED always excluded from the output.
        n_enhancers: number of top-count regions to keep.
        peak_extend: bp added on each side of every summit.

    Returns:
        The ``script(...)`` task; outputs expose
        ``candidate_enhancer_regions_file`` and
        ``candidate_enhancer_regions_path``.
    """
    ## Generate enhancer regions from MACS summits
    # 1. Count reads in dhs peaks
    # 2. Take top N regions, get summits, extend summits, merge
    makedirs(output_dir)
    makedirs(tmpdir)
    outfile = join(output_dir, basename(macs_peaks) + ".candidateRegions.bed")
    chrom_sizes_bed = ".".join([chrom_sizes, "bed"])

    # Local (staged) counterparts of the always-required files.
    local_macs_peaks = join(tmpdir, basename(macs_peaks))
    local_chrom_sizes = join(tmpdir, basename(chrom_sizes))
    local_chrom_sizes_bed = ".".join([local_chrom_sizes, "bed"])
    local_outfile = join(tmpdir, basename(outfile))
    local_count_file = join(tmpdir, basename(count_file))

    inputs = [
        File(count_file).stage(File(local_count_file)),
        File(chrom_sizes).stage(File(local_chrom_sizes)),
        File(chrom_sizes_bed).stage(File(local_chrom_sizes_bed)),
        File(macs_peaks).stage(File(local_macs_peaks)),
    ]

    if regions_whitelist:
        # BUG FIX: computed/staged only when a whitelist is given —
        # basename(None) raised a TypeError for the default arguments, and
        # File(None) was staged unconditionally.
        local_regions_whitelist = join(tmpdir, basename(regions_whitelist))
        whitelist_command = (
            "( bedtools intersect -a " + local_regions_whitelist + " -b "
            + local_chrom_sizes_bed + " -wa | cut -f1-3 && cat ) | "
        )
        inputs.append(File(regions_whitelist).stage(File(local_regions_whitelist)))
    else:
        whitelist_command = ""

    if regions_blacklist:
        # BUG FIX: same None-guard as the whitelist above.
        local_regions_blacklist = join(tmpdir, basename(regions_blacklist))
        blacklist_command = (
            "bedtools intersect -v -wa -a stdin -b " + local_regions_blacklist + " | "
        )
        inputs.append(File(regions_blacklist).stage(File(local_regions_blacklist)))
    else:
        blacklist_command = ""

    # 2. Take top N regions, get summits, extend summits, merge, remove
    # blacklist, add whitelist, sort and merge.
    # use -sorted in intersect command? Not worth it, both files are small.
    return script(
        f"""
#!/bin/bash
bedtools sort -i {local_count_file} -faidx {local_chrom_sizes} | bedtools merge -i stdin -c 4 -o max | sort -nr -k 4 | head -n {n_enhancers} | \
bedtools intersect -b stdin -a {local_macs_peaks} -wa | \
awk '{{print $1"\t"$2 + $10"\t"$2 + $10}}' | \
bedtools slop -i stdin -b {peak_extend} -g {local_chrom_sizes} | \
bedtools sort -i stdin -faidx {local_chrom_sizes} | \
bedtools merge -i stdin | \
{blacklist_command} \
cut -f 1-3 | {whitelist_command} \
bedtools sort -i stdin -faidx {local_chrom_sizes} | bedtools merge -i stdin > {local_outfile}
""",
        inputs=inputs,
        outputs={
            "candidate_enhancer_regions_file": File(outfile).stage(local_outfile),
            "candidate_enhancer_regions_path": outfile,
        },
    )