import os
from os.path import basename, join

import numpy as np
import pandas as pd

# Helper functions referenced below (write_params, makedirs, run_count_reads, process_chr,
# parseargs, make_candidate_regions_from_summits, make_candidate_regions_from_peaks) are
# defined elsewhere in this codebase.


def processCellType(args):
    os.makedirs(args.outDir, exist_ok=True)
    write_params(args, os.path.join(args.outDir, "params.txt"))

    # Make candidate regions
    if not args.ignoreSummits:
        make_candidate_regions_from_summits(
            macs_peaks=args.narrowPeak,
            accessibility_file=args.bam,
            genome_sizes=args.chrom_sizes,
            regions_includelist=args.regions_includelist,
            regions_blocklist=args.regions_blocklist,
            n_enhancers=args.nStrongestPeaks,
            peak_extend=args.peakExtendFromSummit,
            outdir=args.outDir,
        )
    else:
        make_candidate_regions_from_peaks(
            macs_peaks=args.narrowPeak,
            accessibility_file=args.bam,
            genome_sizes=args.chrom_sizes,
            regions_includelist=args.regions_includelist,
            regions_blocklist=args.regions_blocklist,
            n_enhancers=args.nStrongestPeaks,
            peak_extend=args.peakExtendFromSummit,
            minPeakWidth=args.minPeakWidth,
            outdir=args.outDir,
        )
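# Illustrative only: a minimal, hypothetical argparse setup showing the attributes that
# processCellType reads from its args namespace. Flag names, defaults, and help strings
# are assumptions for this sketch, not taken from this codebase.
def _example_parseargs_for_processCellType():
    import argparse

    parser = argparse.ArgumentParser(description="Define candidate regions for one cell type")
    parser.add_argument("--narrowPeak", required=True, help="MACS2 narrowPeak file")
    parser.add_argument("--bam", required=True, help="DNase-seq or ATAC-seq BAM file")
    parser.add_argument("--chrom_sizes", required=True, help="Chromosome sizes file")
    parser.add_argument("--regions_includelist", default=None, help="BED of regions to force-include")
    parser.add_argument("--regions_blocklist", default=None, help="BED of regions to exclude")
    parser.add_argument("--nStrongestPeaks", type=int, default=175000)
    parser.add_argument("--peakExtendFromSummit", type=int, default=250)
    parser.add_argument("--minPeakWidth", type=int, default=500)
    parser.add_argument("--ignoreSummits", action="store_true")
    parser.add_argument("--outDir", required=True)
    return parser.parse_args()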
def makeCandidateRegions(
    narrowPeak: str,
    input_bam: str,
    output_dir: str,
    chrom_sizes: str,
    tmpdir: str,
    regions_blacklist: str = None,
    regions_whitelist: str = None,
    peakExtendFromSummit: int = 250,
    nStrongestPeaks: int = 175000,
    ignoreSummits: bool = False,
    minPeakWidth: int = 500,
):
    """
    Inputs:
    narrowPeak: narrowPeak file output by MACS2. Must include summits (--call-summits)
    input_bam: DNase-seq or ATAC-seq BAM file
    chrom_sizes: file listing chromosome size annotations
    output_dir: output folder where results will be stored; created if it doesn't exist
    tmpdir: directory for temporary/intermediate files; created if it doesn't exist
    nStrongestPeaks: number of peaks to use for defining candidate regions
    peakExtendFromSummit: number of base pairs to extend each peak from its summit (or from both ends of the region if using --ignoreSummits)
    ignoreSummits: compute peaks using the full peak regions, rather than extending from the summit
    minPeakWidth: candidate regions whose width is below this threshold are expanded to this width. Only used with --ignoreSummits
    regions_whitelist: BED file of regions to forcibly include in candidate enhancers. Overrides regions_blacklist
    regions_blacklist: BED file of regions to forcibly exclude from candidate enhancers
    """
    # Create output directories.
    # If the output directory is in S3, create a local directory from its basename to serve as the workdir.
    makedirs(output_dir)
    makedirs(tmpdir)
    write_params(
        {
            "narrowPeak": narrowPeak,
            "input_bam": input_bam,
            "output_dir": output_dir,
            "tmpdir": tmpdir,
            "chrom_sizes": chrom_sizes,
            "regions_blacklist": regions_blacklist,
            "regions_whitelist": regions_whitelist,
            "peakExtendFromSummit": peakExtendFromSummit,
            "nStrongestPeaks": nStrongestPeaks,
            "ignoreSummits": ignoreSummits,
            "minPeakWidth": minPeakWidth,
        },
        output_dir,
        tmpdir,
        "parameters.txt",
    )

    # 1. Count DHS/ATAC reads in candidate regions
    raw_counts_outfile = join(
        output_dir, basename(narrowPeak) + "." + basename(input_bam) + ".Counts.bed"
    )
    run_count_reads_out = run_count_reads(
        target=input_bam,
        output=raw_counts_outfile,
        output_dir=output_dir,
        tmpdir=tmpdir,
        bed_file=narrowPeak,
        chrom_sizes=chrom_sizes,
        use_fast_count=True,
    )

    # 2. Make candidate regions
    if not ignoreSummits:
        return make_candidate_regions_from_summits(
            count_file=run_count_reads_out["path"],
            macs_peaks=narrowPeak,
            chrom_sizes=chrom_sizes,
            regions_whitelist=regions_whitelist,
            regions_blacklist=regions_blacklist,
            n_enhancers=nStrongestPeaks,
            peak_extend=peakExtendFromSummit,
            output_dir=output_dir,
            tmpdir=tmpdir,
        )
    else:
        return make_candidate_regions_from_peaks(
            count_file=run_count_reads_out["path"],
            macs_peaks=narrowPeak,
            chrom_sizes=chrom_sizes,
            regions_whitelist=regions_whitelist,
            regions_blacklist=regions_blacklist,
            n_enhancers=nStrongestPeaks,
            peak_extend=peakExtendFromSummit,
            minPeakWidth=minPeakWidth,
            output_dir=output_dir,
            tmpdir=tmpdir,
        )
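# Illustrative only: one way makeCandidateRegions might be invoked directly. All paths
# below are hypothetical placeholders; the return value is whatever
# make_candidate_regions_from_summits / make_candidate_regions_from_peaks return here.
def _example_makeCandidateRegions_call():
    return makeCandidateRegions(
        narrowPeak="peaks/sample.macs2_peaks.narrowPeak",
        input_bam="alignments/sample.dedup.bam",
        output_dir="results/candidate_regions",
        chrom_sizes="reference/hg38.chrom.sizes",
        tmpdir="tmp/candidate_regions",
        regions_blacklist="reference/blacklist.bed",
        regions_whitelist="reference/tss_include.bed",
        peakExtendFromSummit=250,
        nStrongestPeaks=150000,
    )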
def main():
    args = parseargs()
    os.makedirs(args.outDir, exist_ok=True)

    # Write params file
    write_params(args, os.path.join(args.outDir, "params.txt"))

    # Parse cell types
    cell_types = args.celltypes.split(",")

    # chromosomes = ['chr' + str(x) for x in range(1, 23)] + ['chrX']
    # chromosomes = ['chr22']
    special_value = np.inf

    # for chromosome in chromosomes:
    hic_list = [
        process_chr(
            cell_type,
            args.chromosome,
            args.basedir,
            args.resolution,
            args.ref_scale,
            args.ref_gamma,
            special_value,
        )
        for cell_type in cell_types
    ]
    hic_list = [x for x in hic_list if x is not None]
    hic_list = [df.set_index(["bin1", "bin2"]) for df in hic_list]

    # Merge all Hi-C matrices and compute the average.
    # NaN vs 0 must be distinguished here: in the KR-normalized matrices, NaN marks missing
    # data, while rows absent from a cell type's dataframe should be treated as 0. After an
    # outer join both show up as NaN in the merged dataframe, so NaN in the cell-type-specific
    # dataframes is first converted to a sentinel (special_value) and restored to NaN after merging.
    # TODO: This is very memory intensive! (consider pandas.join or pandas.concat)
    all_hic = pd.concat(hic_list, axis=1, join="outer", copy=False)
    hic_list = None  # Clear from memory
    all_hic.fillna(value=0, inplace=True)
    all_hic.replace(to_replace=special_value, value=np.nan, inplace=True)

    # Compute the average
    cols_for_avg = list(filter(lambda x: "hic_kr" in x, all_hic.columns))
    # all_hic['avg_hic'] = all_hic[cols_for_avg].mean(axis=1)
    # avg_hic = all_hic[cols_for_avg].mean(axis=1)
    avg_hic = all_hic.mean(axis=1)
    num_good = len(cols_for_avg) - np.isnan(all_hic).sum(axis=1)

    # Require a minimum number of cell types with data at each bin pair
    all_hic.drop(cols_for_avg, inplace=True, axis=1)
    all_hic.reset_index(level=all_hic.index.names, inplace=True)
    all_hic["avg_hic"] = avg_hic.values
    all_hic.loc[num_good.values < args.min_cell_types_required, "avg_hic"] = np.nan

    # Set up the final matrix
    all_hic["bin1"] = all_hic["bin1"] * args.resolution
    all_hic["bin2"] = all_hic["bin2"] * args.resolution
    all_hic = all_hic.loc[
        np.logical_or(all_hic["avg_hic"] > 0, np.isnan(all_hic["avg_hic"])),
    ]  # why do these 0's exist?

    os.makedirs(os.path.join(args.outDir, args.chromosome), exist_ok=True)
    all_hic.to_csv(
        os.path.join(args.outDir, args.chromosome, args.chromosome + ".avg.gz"),
        sep="\t",
        header=False,
        index=False,
        compression="gzip",
        na_rep=np.nan,
    )
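# Illustrative only: a standalone toy example of the NaN-vs-0 sentinel trick described in
# main(). In the real pipeline the sentinel substitution happens inside process_chr via
# special_value; here both steps are shown on tiny hand-made dataframes.
def _example_sentinel_merge():
    sentinel = np.inf
    a = pd.DataFrame(
        {"bin1": [0, 1], "bin2": [1, 2], "hic_kr": [0.5, np.nan]}
    ).set_index(["bin1", "bin2"])
    b = pd.DataFrame(
        {"bin1": [1], "bin2": [2], "hic_kr": [0.8]}
    ).set_index(["bin1", "bin2"])

    # Mark genuinely missing values with the sentinel before merging.
    a = a.fillna(sentinel)
    merged = pd.concat([a, b], axis=1, join="outer")
    merged = merged.fillna(0)                    # rows absent from a cell type -> 0
    merged = merged.replace(sentinel, np.nan)    # restore true missing values -> NaN
    return merged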